The following libraries are imported for this analysis.

library(ggplot2)
library(caret)
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(ranger)
library(rpart)
library(rpart.plot)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(Matrix)
library(DiagrammeR)
library(e1071)

# NOTE(review): globally silencing all warnings hides real problems; consider
# removing this, or wrapping only the specific noisy calls in suppressWarnings().
options(warn = -1) 
# Reading the csv file
# NOTE(review): absolute, machine-specific path — consider a relative path or
# file.path() so the script is portable across machines.
insurance.data.dup <- read.csv("~/Documents/GitHub/GitHub/Insurance-Claim-Prediction/data/insurance.csv")
# Work on a copy so the raw import stays untouched in insurance.data.dup
insurance.data <- insurance.data.dup

# Structure of the raw data: 1338 rows, 8 columns, all stored as numeric codes
str(insurance.data)
## 'data.frame':    1338 obs. of  8 variables:
##  $ age           : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex           : int  0 1 1 1 1 0 0 0 1 0 ...
##  $ bmi           : num  27.9 33.8 33 22.7 28.9 ...
##  $ children      : int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ region        : int  3 2 2 1 1 2 2 1 0 1 ...
##  $ charges       : num  16885 1726 4449 21984 3867 ...
##  $ insuranceclaim: int  1 1 0 0 1 0 1 0 0 0 ...
# Row count before de-duplication
nrow(insurance.data)
## [1] 1338

There is a total of 1338 observations, with explanatory variables age, sex, bmi, children, smoker, region and charges, as well as the response variable insuranceclaim.

#Data Preprocessing

Following code removes the duplicates from the data if there are any.

# Drop exact duplicate rows (first occurrence kept) and report the new row count.
insurance.data <- insurance.data[!duplicated(insurance.data), ]
nrow(insurance.data)
## [1] 1337

Count of the data is 1337 after applying the unique function on the data.

# Frequency tables for each categorical predictor (still integer-coded here)
print("sex")
## [1] "sex"
table(insurance.data$sex)
## 
##   0   1 
## 662 675
print("children")
## [1] "children"
table(insurance.data$children)
## 
##   0   1   2   3   4   5 
## 573 324 240 157  25  18
print("smoker")
## [1] "smoker"
table(insurance.data$smoker)
## 
##    0    1 
## 1063  274
print("region")
## [1] "region"
table(insurance.data$region)
## 
##   0   1   2   3 
## 324 324 364 325

The predictor variables sex, children, smoker and region contain 2, 6, 2 and 4 distinct levels respectively. There are 662 females and 675 males. The total count of non-smokers is 1063 compared to 274 smokers. There are 4 regions, mapped as northeast=0, northwest=1, southeast=2, southwest=3, with counts of 324, 324, 364 and 325 respectively. The children feature indicates the number of children or dependents: 573 records have no dependents, 324 have 1, 240 have 2, 157 have 3, 25 have 4 and 18 have 5.

# Class balance of the response: 0 = no claim, 1 = claim
table(insurance.data$insuranceclaim)
## 
##   0   1 
## 555 782

The response variable contains two levels, 0 and 1, where 0 indicates no claim and 1 indicates a claim; their counts are 555 and 782 respectively.

Following displays the unique values of each explanatory variable

# Distinct values of every column; returns a list because the columns have
# different numbers of unique values (so sapply cannot simplify)
sapply(insurance.data, function(x) unique(x))
## $age
##  [1] 19 18 28 33 32 31 46 37 60 25 62 23 56 27 52 30 34 59 63 55 22 26 35 24 41
## [26] 38 36 21 48 40 58 53 43 64 20 61 44 57 29 45 54 49 47 51 42 50 39
## 
## $sex
## [1] 0 1
## 
## $bmi
##   [1] 27.900 33.770 33.000 22.705 28.880 25.740 33.440 27.740 29.830 25.840
##  [11] 26.220 26.290 34.400 39.820 42.130 24.600 30.780 23.845 40.300 35.300
##  [21] 36.005 32.400 34.100 31.920 28.025 27.720 23.085 32.775 17.385 36.300
##  [31] 35.600 26.315 28.600 28.310 36.400 20.425 32.965 20.800 36.670 39.900
##  [41] 26.600 36.630 21.780 30.800 37.050 37.300 38.665 34.770 24.530 35.200
##  [51] 35.625 33.630 28.000 34.430 28.690 36.955 31.825 31.680 22.880 37.335
##  [61] 27.360 33.660 24.700 25.935 22.420 28.900 39.100 36.190 23.980 24.750
##  [71] 28.500 28.100 32.010 27.400 34.010 29.590 35.530 39.805 26.885 38.285
##  [81] 37.620 41.230 34.800 22.895 31.160 27.200 26.980 39.490 24.795 31.300
##  [91] 38.280 19.950 19.300 31.600 25.460 30.115 29.920 27.500 28.400 30.875
## [101] 27.940 35.090 29.700 35.720 32.205 28.595 49.060 27.170 23.370 37.100
## [111] 23.750 28.975 31.350 33.915 28.785 28.300 37.400 17.765 34.700 26.505
## [121] 22.040 35.900 25.555 28.050 25.175 31.900 36.000 32.490 25.300 29.735
## [131] 38.830 30.495 37.730 37.430 24.130 37.145 39.520 24.420 27.830 36.850
## [141] 39.600 29.800 29.640 28.215 37.000 33.155 18.905 41.470 30.300 15.960
## [151] 33.345 37.700 27.835 29.200 26.410 30.690 41.895 30.900 32.200 32.110
## [161] 31.570 26.200 30.590 32.800 18.050 39.330 32.230 24.035 36.080 22.300
## [171] 26.400 31.800 26.730 23.100 23.210 33.700 33.250 24.640 33.880 38.060
## [181] 41.910 31.635 36.195 17.800 24.510 22.220 38.390 29.070 22.135 26.800
## [191] 30.020 35.860 20.900 17.290 34.210 25.365 40.150 24.415 25.200 26.840
## [201] 24.320 42.350 19.800 32.395 30.200 29.370 34.200 27.455 27.550 20.615
## [211] 24.300 31.790 21.560 28.120 40.565 27.645 31.200 26.620 48.070 36.765
## [221] 33.400 45.540 28.820 22.990 27.700 25.410 34.390 22.610 37.510 38.000
## [231] 33.330 34.865 33.060 35.970 31.400 25.270 40.945 34.105 36.480 33.800
## [241] 36.700 36.385 34.500 32.300 27.600 29.260 35.750 23.180 25.600 35.245
## [251] 43.890 20.790 30.500 21.700 21.890 24.985 32.015 30.400 21.090 22.230
## [261] 32.900 24.890 31.460 17.955 30.685 43.340 39.050 30.210 31.445 19.855
## [271] 31.020 38.170 20.600 47.520 20.400 38.380 24.310 23.600 21.120 30.030
## [281] 17.480 20.235 17.195 23.900 35.150 35.640 22.600 39.160 27.265 29.165
## [291] 16.815 33.100 26.900 33.110 31.730 46.750 29.450 32.680 33.500 43.010
## [301] 36.520 26.695 25.650 29.600 38.600 23.400 46.530 30.140 30.000 38.095
## [311] 28.380 28.700 33.820 24.090 32.670 25.100 32.560 41.325 39.500 34.300
## [321] 31.065 21.470 25.080 43.400 25.700 27.930 39.200 26.030 30.250 28.930
## [331] 35.700 35.310 31.000 44.220 26.070 25.800 39.425 40.480 38.900 47.410
## [341] 35.435 46.700 46.200 21.400 23.800 44.770 32.120 29.100 37.290 43.120
## [351] 36.860 34.295 23.465 45.430 23.650 20.700 28.270 35.910 29.000 19.570
## [361] 31.130 21.850 40.260 33.725 29.480 32.600 37.525 23.655 37.800 19.000
## [371] 21.300 33.535 42.460 38.950 36.100 29.300 39.700 38.190 42.400 34.960
## [381] 42.680 31.540 29.810 21.375 40.810 17.400 20.300 18.500 26.125 41.690
## [391] 24.100 36.200 40.185 39.270 34.870 44.745 29.545 23.540 40.470 40.660
## [401] 36.600 35.400 27.075 28.405 21.755 40.280 30.100 32.100 23.700 35.500
## [411] 29.150 27.000 37.905 22.770 22.800 34.580 27.100 19.475 26.700 34.320
## [421] 24.400 41.140 22.515 41.800 26.180 42.240 26.510 35.815 41.420 36.575
## [431] 42.940 21.010 24.225 17.670 31.500 31.100 32.780 32.450 50.380 47.600
## [441] 25.400 29.900 43.700 24.860 28.800 29.500 29.040 38.940 44.000 20.045
## [451] 40.920 35.100 29.355 32.585 32.340 39.800 24.605 33.990 28.200 25.000
## [461] 33.200 23.200 20.100 32.500 37.180 46.090 39.930 35.800 31.255 18.335
## [471] 42.900 26.790 39.615 25.900 25.745 28.160 23.560 40.500 35.420 39.995
## [481] 34.675 20.520 23.275 36.290 32.700 19.190 20.130 23.320 45.320 34.600
## [491] 18.715 21.565 23.000 37.070 52.580 42.655 21.660 32.000 18.300 47.740
## [501] 22.100 19.095 31.240 29.925 20.350 25.850 42.750 18.600 23.870 45.900
## [511] 21.500 30.305 44.880 41.100 40.370 28.490 33.550 40.375 27.280 17.860
## [521] 33.300 39.140 21.945 24.970 23.940 34.485 21.800 23.300 36.960 21.280
## [531] 29.400 27.300 37.900 37.715 23.760 25.520 27.610 27.060 39.400 34.900
## [541] 22.000 30.360 27.800 53.130 39.710 32.870 44.700 30.970
## 
## $children
## [1] 0 1 3 2 5 4
## 
## $smoker
## [1] 1 0
## 
## $region
## [1] 3 2 1 0
## 
## $charges
##    [1] 16884.924  1725.552  4449.462 21984.471  3866.855  3756.622  8240.590
##    [8]  7281.506  6406.411 28923.137  2721.321 27808.725  1826.843 11090.718
##   [15] 39611.758  1837.237 10797.336  2395.172 10602.385 36837.467 13228.847
##   [22]  4149.736  1137.011 37701.877  6203.902 14001.134 14451.835 12268.632
##   [29]  2775.192 38711.000 35585.576  2198.190  4687.797 13770.098 51194.559
##   [36]  1625.434 15612.193  2302.300 39774.276 48173.361  3046.062  4949.759
##   [43]  6272.477  6313.759  6079.672 20630.284  3393.356  3556.922 12629.897
##   [50] 38709.176  2211.131  3579.829 23568.272 37742.576  8059.679 47496.494
##   [57] 13607.369 34303.167 23244.790  5989.524  8606.217  4504.662 30166.618
##   [64]  4133.642 14711.744  1743.214 14235.072  6389.378  5920.104 17663.144
##   [71] 16577.780  6799.458 11741.726 11946.626  7726.854 11356.661  3947.413
##   [78]  1532.470  2755.021  6571.024  4441.213  7935.291 37165.164 11033.662
##   [85] 39836.519 21098.554 43578.939 11073.176  8026.667 11082.577  2026.974
##   [92] 10942.132 30184.937  5729.005 47291.055  3766.884 12105.320 10226.284
##   [99] 22412.648 15820.699  6186.127  3645.089 21344.847 30942.192  5003.853
##  [106] 17560.380  2331.519  3877.304  2867.120 47055.532 10825.254 11881.358
##  [113]  4646.759  2404.734 11488.317 30259.996 11381.325 19107.780  8601.329
##  [120]  6686.431  7740.337  1705.624  2257.475 39556.495 10115.009  3385.399
##  [127] 17081.080  9634.538 32734.186  6082.405 12815.445 13616.359 11163.568
##  [134]  1632.564  2457.211  2155.682  1261.442  2045.685 27322.734  2166.732
##  [141] 27375.905  3490.549 18972.495 18157.876 20745.989  5138.257 40720.551
##  [148]  9877.608 10959.695  1842.519  5125.216  7789.635  6334.344 19964.746
##  [155]  7077.189  6948.701 21223.676 15518.180 36950.257 19749.383 21348.706
##  [162] 36149.484 10450.552  5152.134  5028.147 10407.086  4830.630  6128.797
##  [169]  2719.280  4827.905 13405.390  8116.680  1694.796  5246.047  2855.438
##  [176] 48824.450  6455.863 10436.096  8823.279  8538.288 11735.879  1631.821
##  [183]  4005.423  7419.478  7731.427 43753.337  3981.977  5325.651  6775.961
##  [190]  4922.916 12557.605  4883.866  2137.654 12044.342  1137.470  1639.563
##  [197]  5649.715  8516.829  9644.253 14901.517  2130.676  8871.152 13012.209
##  [204] 37133.898  7147.105  4337.735 11743.299 20984.094 13880.949  6610.110
##  [211]  1980.070  8162.716  3537.703  5002.783  8520.026  7371.772 10355.641
##  [218]  2483.736  3392.977 25081.768  5012.471 10564.885  5253.524 34779.615
##  [225] 19515.542 11987.168  2689.495 24227.337  7358.176  9225.256  7443.643
##  [232] 14001.287  1727.785 12333.828  6710.192 19444.266  1615.767  4463.205
##  [239] 17352.680  7152.671 38511.628  5354.075 35160.135  7196.867 29523.166
##  [246] 24476.479 12648.703  1986.933  1832.094  4040.558 12829.455 47305.305
##  [253] 44260.750  4260.744 41097.162 13047.332 43921.184  5400.980 11520.100
##  [260] 33750.292 11837.160 17085.268 24869.837 36219.405 20462.998 46151.124
##  [267] 17179.522 14590.632  7441.053  9282.481  1719.436 42856.838  7265.703
##  [274]  9617.662  2523.169  9715.841  2803.698  2150.469 12928.791  9855.131
##  [281] 22331.567 48549.178  4237.127 11879.104  9625.920  7742.110  9432.925
##  [288] 14256.193 47896.791 25992.821  3172.018 20277.808 42112.236  2156.752
##  [295]  3906.127  1704.568 16297.846 21978.677 38746.355  9249.495  6746.743
##  [302] 24873.385 12265.507  4349.462 12646.207 19442.354 20177.671  4151.029
##  [309] 11944.594  7749.156  8444.474  1737.376 42124.515  8124.408 34838.873
##  [316]  9722.770  8835.265 10435.065  7421.195  4667.608  4894.753 24671.663
##  [323] 35491.640 11566.301  2866.091  6600.206  3561.889 42760.502 47928.030
##  [330]  9144.565 48517.563 24393.622 13429.035 11658.379 19144.577 13822.803
##  [337] 12142.579 13937.666 41919.097  8232.639 18955.220 13352.100 13217.094
##  [344] 13981.850 10977.206  6184.299  4889.999  8334.458  5478.037  1635.734
##  [351] 11830.607  8932.084  3554.203 12404.879 14133.038 24603.048  8944.115
##  [358]  9620.331  1837.282  1607.510 10043.249  4751.070 13844.506  2597.779
##  [365]  3180.510  9778.347 13430.265  8017.061  8116.269  3481.868 13415.038
##  [372] 12029.287  7639.417 36085.219  1391.529 18033.968 21659.930 38126.247
##  [379] 16455.708 27000.985 15006.579 42303.692 20781.489  5846.918  8302.536
##  [386]  1261.859 11856.412 30284.643  3176.816  4618.080 10736.871  2138.071
##  [393]  8964.061  9290.139  9411.005  7526.706  8522.003 16586.498 14988.432
##  [400]  1631.668  9264.797  8083.920 14692.669 10269.460  3260.199 11396.900
##  [407]  4185.098  8539.671  6652.529  4074.454  1621.340 19594.810 14455.644
##  [414]  5080.096  2134.901  7345.727  9140.951 18608.262 14418.280 28950.469
##  [421] 46889.261 46599.108 39125.332  2727.395  8968.330  9788.866  6555.070
##  [428]  7323.735  3167.456 18804.752 23082.955  4906.410  5969.723 12638.195
##  [435]  4243.590 13919.823  2254.797  5926.846 12592.534  2897.323  4738.268
##  [442] 37079.372  1149.396 28287.898 26109.329  7345.084 12731.000 11454.022
##  [449]  5910.944  4762.329  7512.267  4032.241  1969.614  1769.532  4686.389
##  [456] 21797.000 11881.970 11840.775 10601.412  7682.670 10381.479 22144.032
##  [463] 15230.324 11165.418  1632.036 19521.968 13224.693 12643.378 23288.928
##  [470]  2201.097  2497.038  2203.472  1744.465 20878.784 25382.297 28868.664
##  [477] 35147.528  2534.394  1534.304  1824.285 15555.189  9304.702  1622.188
##  [484]  9880.068  9563.029  4347.023 12475.351  1253.936 48885.136 10461.979
##  [491]  1748.774 24513.091  2196.473 12574.049 17942.106  1967.023  4931.647
##  [498]  8027.968  8211.100 13470.860 36197.699  6837.369 22218.115 32548.340
##  [505]  5974.385  6796.863  2643.269  3077.095  3044.213 11455.280 11763.001
##  [512]  2498.414  9361.327  1256.299 21082.160 11362.755 27724.289  8413.463
##  [519]  5240.765  3857.759 25656.575  3994.178  9866.305  5397.617 38245.593
##  [526] 11482.635 24059.680  9861.025  8342.909  1708.001 48675.518 14043.477
##  [533] 12925.886 19214.706 13831.115  6067.127  5972.378  8825.086  8233.097
##  [540] 27346.042  6196.448  3056.388 13887.204 63770.428 10231.500 23807.241
##  [547]  3268.847 11538.421  3213.622 45863.205 13390.559  3972.925 12957.118
##  [554] 11187.657 17878.901  3847.674  8334.590  3935.180 39983.426  1646.430
##  [561]  9193.838 10923.933  2494.022  9058.730  2801.259  2128.431  6373.557
##  [568]  7256.723 11552.904 45702.022  3761.292  2219.445  4753.637 31620.001
##  [575] 13224.057 12222.898  1665.000 58571.074  9724.530  3206.491 12913.992
##  [582]  6356.271 17626.240  1242.816  4779.602  3861.210 43943.876 13635.638
##  [589]  5976.831 11842.442  8428.069  2566.471 15359.104  5709.164  8823.986
##  [596]  7640.309  5594.846  7441.501 33471.972  1633.044  9174.136 11070.535
##  [603] 16085.128 17468.984  9283.562  3558.620 25678.778  4435.094 39241.442
##  [610]  8547.691  6571.544  2207.697  6753.038  1880.070 42969.853 11658.115
##  [617] 23306.547 34439.856 10713.644  3659.346 40182.246  9182.170 34617.841
##  [624] 12129.614  3736.465  6748.591 11326.715 11365.952 42983.459 10085.846
##  [631]  1977.815  3366.670  7173.360  9391.346 14410.932  2709.112 24915.046
##  [638] 20149.323 12949.155  6666.243 32787.459 13143.865  4466.621 18806.145
##  [645] 10141.136  6123.569  8252.284  1712.227 12430.953  9800.888 10579.711
##  [652]  8280.623  8527.532 12244.531 24667.419  3410.324  4058.712 26392.260
##  [659] 14394.398  6435.624 22192.437  5148.553  1136.399 27037.914 42560.430
##  [666]  8703.456 40003.332 45710.208  6500.236  4837.582  3943.595  4399.731
##  [673]  6185.321 46200.985  7222.786 12485.801 46130.526 12363.547 10156.783
##  [680]  2585.269  1242.260 40103.890  9863.472  4766.022 11244.377  7729.646
##  [687]  5438.749 26236.580 34806.468  2104.113  8068.185  2362.229  2352.968
##  [694]  3577.999  3201.245 29186.482 40273.645 10976.246  3500.612  2020.552
##  [701]  9541.696  9504.310  5385.338  8930.935  5375.038 44400.406 10264.442
##  [708]  6113.231  5469.007  1727.540 10107.221  8310.839  1984.453  2457.502
##  [715] 12146.971  9566.991 13112.605 10848.134 12231.614  9875.680 11264.541
##  [722] 12979.358  1263.249 10106.134 40932.429  6664.686 16657.717  2217.601
##  [729]  6781.354 19361.999 10065.413  4234.927  9447.250 14007.222  9583.893
##  [736] 40419.019  3484.331 36189.102 44585.456  8604.484 18246.496 43254.418
##  [743]  3757.845  8827.210  9910.360 11737.849  1627.282  8556.907  3062.508
##  [750] 19539.243  1906.358 14210.536 11833.782 17128.426  5031.270  7985.815
##  [757] 23065.421  5428.728 36307.798  3925.758  2416.955 19040.876  3070.809
##  [764]  9095.068 11842.624  8062.764  7050.642 14319.031  6933.242 27941.288
##  [771] 11150.780 12797.210 17748.506  7261.741 10560.492  6986.697  7448.404
##  [778]  5934.380  9869.810 18259.216  1146.797  9386.161 24520.264  4350.514
##  [785]  6414.178 12741.167  1917.318  5209.579 13457.961  5662.225  1252.407
##  [792]  2731.912 21195.818  7209.492 18310.742  4266.166  4719.524 11848.141
##  [799] 17904.527  7046.722 14313.846  2103.080 38792.686  1815.876  7731.858
##  [806] 28476.735  2136.882  1131.507  3309.793  9414.920  6360.994 11013.712
##  [813]  4428.888  5584.306  1877.929  2842.761  3597.596 23401.306 55135.402
##  [820]  7445.918  2680.949  1621.883  8219.204 12523.605 16069.085 43813.866
##  [827] 20773.628 39597.407  6117.494 13393.756  5266.366  4719.737 11743.934
##  [834]  5377.458  7160.330  4402.233 11657.719  6402.291 12622.180  1526.312
##  [841] 12323.936 36021.011 27533.913 10072.055 45008.955  9872.701  2438.055
##  [848]  2974.126 10601.632 37270.151 14119.620 42111.665 11729.680 24106.913
##  [855]  1875.344 40974.165 15817.986 18218.161 10965.446 46113.511  7151.092
##  [862] 12269.689  5458.046  8782.469  6600.361  1141.445 11576.130 13129.603
##  [869]  4391.652  8457.818  3392.365  5966.887  6849.026  8891.139  2690.114
##  [876] 26140.360  6653.789  6282.235  6311.952  3443.064  2789.057  2585.851
##  [883] 46255.113  4877.981 19719.695 27218.437  5272.176  1682.597 11945.133
##  [890] 29330.983  7243.814 10422.917 44202.654 13555.005 13063.883 19798.055
##  [897]  2221.564  1634.573  2117.339  8688.859 48673.559  4661.286  8125.784
##  [904] 12644.589  4564.191  4846.920  7633.721 15170.069 17496.306  2639.043
##  [911] 33732.687 14382.709  7626.993  5257.508  2473.334 21774.322 35069.375
##  [918] 13041.921  5245.227 13451.122 13462.520  5488.262  4320.411  6250.435
##  [925] 25333.333  2913.569 12032.326 13470.804  6289.755  2927.065  6238.298
##  [932] 10096.970  7348.142  4673.392 12233.828 32108.663  8965.796  2304.002
##  [939]  9487.644  1121.874  9549.565  2217.469  1628.471 12982.875 11674.130
##  [946]  7160.094 39047.285  6358.776 19933.458 11534.873 47462.894  4527.183
##  [953] 38998.546 20009.634  3875.734 41999.520 12609.887 41034.221 28468.919
##  [960]  2730.108  3353.284 14474.675  9500.573 26467.097  4746.344 23967.383
##  [967]  7518.025  3279.869  8596.828 10702.642  4992.376  2527.819  1759.338
##  [974]  2322.622 16138.762  7804.160  2902.907  9704.668  4889.037 25517.114
##  [981]  4500.339 19199.944 16796.412  4915.060  7624.630  8410.047 28340.189
##  [988]  4518.826 14571.891  3378.910  7144.863 10118.424  5484.467 16420.495
##  [995]  7986.475  7418.522 13887.969  6551.750  5267.818 17361.766 34472.841
## [1002]  1972.950 21232.182  8627.541  4433.388  4438.263 24915.221 23241.475
## [1009]  9957.722  8269.044 18767.738 36580.282  8765.249  5383.536 12124.992
## [1016]  2709.244  3987.926 12495.291 26018.951  8798.593 35595.590 42211.138
## [1023]  1711.027  8569.862  2020.177 16450.895 21595.382  9850.432  6877.980
## [1030] 21677.283 44423.803  4137.523 13747.872 12950.071 12094.478 37484.449
## [1037] 39725.518  2250.835 22493.660 20234.855  1704.700 33475.817  3161.454
## [1044] 11394.066 21880.820  7325.048 44501.398  3594.171 39727.614  8023.135
## [1051] 14394.558  9288.027 25309.489  3353.470 10594.502  8277.523 17929.303
## [1058]  2480.979  4462.722  1981.582 11554.224 48970.248  6548.195  5708.867
## [1065]  7045.499  8978.185  5757.413 14349.854 10928.849 39871.704 13974.456
## [1072]  1909.527 12096.651 13204.286  4562.842  8551.347  2102.265 34672.147
## [1079] 15161.534 11884.049  4454.403  5855.903  4076.497 15019.760 19023.260
## [1086] 10796.350 11353.228  9748.911 10577.087 41676.081 11286.539  3591.480
## [1093] 33907.548 11299.343  4561.189 44641.197  1674.632 23045.566  3227.121
## [1100] 16776.304 11253.421  3471.410 11363.283 20420.605 10338.932  8988.159
## [1107] 10493.946  2904.088  8605.362 11512.405 41949.244 24180.933  5312.170
## [1114]  2396.096 10807.486  9222.403 36124.574 38282.749  5693.431 34166.273
## [1121]  8347.164 46661.442 18903.491 40904.200 14254.608 10214.636  5836.520
## [1128] 14358.364  1728.897  8582.302  3693.428 20709.020  9991.038 19673.336
## [1135] 11085.587  7623.518  3176.288  3704.354 36898.733  9048.027  7954.517
## [1142] 27117.994  6338.076  9630.397 11289.109 52590.829  2261.569 10791.960
## [1149]  5979.731  2203.736 12235.839 40941.285  5630.458 11015.175  7228.216
## [1156] 39722.746 14426.074  2459.720  3989.841  7727.253  5124.189 18963.172
## [1163]  2200.831  7153.554  5227.989 10982.501  4529.477  4670.640  6112.353
## [1170] 17178.682 22478.600 11093.623  6457.843  4433.916  2154.361 23887.663
## [1177]  6496.886  2899.489 19350.369  7650.774  2850.684  2632.992  9447.382
## [1184] 18328.238  8603.823 37465.344 13844.797 21771.342 13126.677  5327.400
## [1191] 13725.472 13019.161  8671.191  4134.082 18838.704 33307.551  5699.837
## [1198]  6393.603  4934.705  6198.752  8733.229  2055.325  9964.060 18223.451
## [1205]  5116.500 36910.608 38415.474 20296.863 12347.172  5373.364 23563.016
## [1212]  1702.455 10806.839  3956.071 12890.058  5415.661  4058.116 41661.602
## [1219]  7537.164  4718.204  6593.508  8442.667 26125.675  6858.480  4795.657
## [1226]  6640.545  7162.012 10594.226 11938.256 60021.399 20167.336 12479.709
## [1233] 11345.519  8515.759  2699.568 14449.854 12224.351  6985.507  3238.436
## [1240] 47269.854 49577.662  4296.271  3171.615  1135.941  5615.369  9101.798
## [1247]  6059.173  1633.962 37607.528 18648.422  1241.565 16232.847 15828.822
## [1254]  4415.159  6474.013 11436.738 11305.935 30063.581 10197.772  4544.235
## [1261]  3277.161  6770.193  7337.748 10370.913 26926.514 10704.470 34254.053
## [1268]  1880.487  8615.300  3292.530  3021.809 14478.330  4747.053 17043.341
## [1275] 10959.330  2741.948  4357.044 22462.044  4189.113  8283.681 24535.699
## [1282] 14283.459  1720.354 47403.880  8534.672  3732.625  5472.449 38344.566
## [1289]  7147.473  7133.903 34828.654  1515.345  9301.894 11931.125  1964.780
## [1296]  1708.926  4340.441  5261.469  2710.829 62592.873 46718.163  3208.787
## [1303] 37829.724 21259.378  2464.619 16115.305 21472.479 33900.653  6875.961
## [1310]  6940.910  4571.413  4536.259 36397.576 18765.875 11272.331  1731.677
## [1317]  1163.463 19496.719  7201.701  5425.023 28101.333 12981.346 43896.376
## [1324]  4239.893 13143.337  7050.021  9377.905 22395.744 10325.206 12629.166
## [1331] 10795.937 11411.685 10600.548  2205.981  1629.833  2007.945 29141.360
## 
## $insuranceclaim
## [1] 1 0

The following code converts the sex, children, smoker, insuranceclaim and region columns to factors.

# Categorical columns currently stored as integer codes
column_names <- c(
     "sex", "children", "smoker", "insuranceclaim", "region"
)

# Recode each listed column as a factor so models treat it as categorical
for (nm in column_names) {
  insurance.data[[nm]] <- factor(insurance.data[[nm]])
}

# Confirm the recoding: the five categorical columns are now factors
str(insurance.data)
## 'data.frame':    1337 obs. of  8 variables:
##  $ age           : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex           : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi           : num  27.9 33.8 33 22.7 28.9 ...
##  $ children      : Factor w/ 6 levels "0","1","2","3",..: 1 2 4 1 1 1 2 4 3 1 ...
##  $ smoker        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region        : Factor w/ 4 levels "0","1","2","3": 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges       : num  16885 1726 4449 21984 3867 ...
##  $ insuranceclaim: Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 1 1 1 ...

Here, we are checking for the null values in any of the column of the dataframe as they could mislead our model’s we fit and predictions.

# TRUE if any cell anywhere is NA (anyNA(insurance.data) is the idiomatic shortcut)
any(is.na(insurance.data))
## [1] FALSE
# Per-column flag: TRUE for any column that contains at least one NA
colSums(is.na(insurance.data)) > 0
##            age            sex            bmi       children         smoker 
##          FALSE          FALSE          FALSE          FALSE          FALSE 
##         region        charges insuranceclaim 
##          FALSE          FALSE          FALSE

There are no such records in our dataframe.

The following bar chart displays the distribution of “claims” and “no claims”. No claims accounted for 41.5% of the responses and claims for 58.5%.

# Percentage share of each insuranceclaim level (0 = no claim, 1 = claim)
percentage_data <- 100 * table(insurance.data$insuranceclaim) / nrow(insurance.data)

# Small data frame in the shape ggplot expects
plot_data <- data.frame(
  insuranceclaim = as.factor(names(percentage_data)),
  percentage     = as.numeric(percentage_data)
)

# Bar chart of claim vs non-claim shares; geom_col() is the geom_bar(stat =
# "identity") shorthand, and labels are centered vertically within each bar
ggplot(plot_data, aes(x = insuranceclaim, y = percentage)) +
  geom_col(fill = "skyblue", color = "black") +
  geom_text(aes(label = sprintf("%.1f%%", percentage)),
            position = position_stack(vjust = 0.5),
            color = "black", size = 3) +
  labs(title = "Distribution of Insurance Claims: Non-Claims (0) vs. Claims (1)",
       x = "Insurance Claim",
       y = "Percentage") 

The following code splits the data into train and test sets in the proportion of 80% and 20%.

# Binomial Logit Model - 80-20 split
# Stratified split on the response so train and test keep (approximately) the
# original claim/no-claim proportions.

set.seed(123457)
train.prop <- 0.80
strats <- insurance.data$insuranceclaim
# Row indices grouped by response level
rr <- split(seq_along(strats), strats)
# Sample floor(80%) of the rows within each stratum. lapply (not sapply) so the
# intermediate type never depends on whether the strata happen to have equal
# sampled sizes; floor() makes the previously implicit size truncation explicit.
idx <- sort(as.numeric(unlist(lapply(rr, 
        function(x) sample(x, floor(length(x) * train.prop))))))
insurance.data.train <- insurance.data[idx, ]
insurance.data.test <- insurance.data[-idx, ]

Following are the distributions of the claims and no claims in the train and test data frames with similar distributions.

# Check that train and test keep (approximately) equal claim proportions

table(insurance.data.train$insuranceclaim)/nrow(insurance.data.train)
## 
##         0         1 
## 0.4153414 0.5846586
table(insurance.data.test$insuranceclaim)/nrow(insurance.data.test)
## 
##         0         1 
## 0.4141791 0.5858209

Since the response variable takes only the values 0 and 1 (a binomial outcome), we can apply a binary logit model.

A binary random variable \(Y\) can assume only one of two possible values, a value of \(1\) (Yes) or a value of \(0\) (No).

A binary random variable \(Y\) has a Bernoulli(\(\pi\)) distribution with

\[ P(Y=1)=\pi = 1-P(Y=0), \] {#eq-bernoulli}

and probability mass function (p.m.f.)

\[ p(y; \pi) = \pi^y (1-\pi)^{1-y},~y = 0 \mbox{ or } 1; 0 \le \pi \le 1. \] {#eq-pmfBern}

A useful transformation of \(\pi\) is the logit (or, log-odds) transformation:

\[ \text{logit}(\pi) = \log \left(\frac{\pi}{1-\pi}\right) \] {#eq-logitpi}

Note: we looked at log odds in @sec-ch3TwoSamp.

Let \(\eta = \text{logit}(\pi)\). After some algebra, we see that we can uniquely write \(\pi\) as a function of \(\eta\), i.e., the inverse transformation is

\[ \pi = \frac{\exp(\eta)}{1+\exp(\eta)} \] {#eq-invlogit}

Logit Model

The binary logit (or, logistic regression) model is a generalized linear model (GLIM) for explaining binary responses \(Y_i\). Our goal is to model the binary responses as functions of \(p\) independent variables denoted by \(X_{i,j},~j=1,\ldots,p\) for each \(i\).

The random component of the GLIM is

\[ Y_i | \pi_i \sim \mbox{Bernoulli}(\pi_i). \] {#eq-logitglim1}

The systematic component is

\[ \eta_i = \beta_0 + \sum_{j=1}^p \beta_j X_{i,j} = \mathbf{x}_i' \boldsymbol{\beta}. \] {#eq-binarysys}

with \(\boldsymbol{\beta} = (\beta_0, \beta_1, \ldots,\beta_p)'\), and \(\mathbf{x}_i = (1, X_{i,1},\ldots, X_{i,p})'\).

The logit link function relates the \(i\)th mean response \(\pi_i\) to the systematic component \(\eta_i\):

\[ \mbox{logit}(\pi_i | \mathbf{x}_i) = \log\left(\frac{\pi_i}{1-\pi_i}\right) = \eta_i. \] {#eq-logitglim2}

Since the mean response \(\pi_i\) must lie in the interval \((0,1)\), whereas \(\eta_i\) is real-valued, we need a function such as the logit function to link the two in a correct way.

By inverting the logit link function (see @eq-invlogit), we can write the binary logit model as

\[ \pi_i = P(Y_i =1 | \mathbf{x}_i) = \frac{\exp(\beta_0 + \sum_{j=1}^p \beta_j X_{i,j})}{ 1+ \exp(\beta_0 + \sum_{j=1}^p \beta_j X_{i,j})}. \] {#eq-logitglim3}

  1. Full Binary logit model was fitted on the data where the response variable is insuranceclaims and the remaining variables such as sex, children, bmi, smokers, age, region and charges.

Following are the null and alternative hypothesis.

Null Hypothesis (\(H_0\)): \[ H_0: \beta_j = 0 \]

The null hypothesis asserts that there is no association between the independent variable \(X_j\) and the log-odds of the dependent variable being in the “success” category.

Alternative Hypothesis (\(H_1\)): \[ H_1: \beta_j \neq 0 \]

The alternative hypothesis suggests that the independent variable \(X_j\) does have a significant association with the log-odds of the event.

# Full binary logit model: insuranceclaim regressed on every other column
full.logit <- glm(
  formula = insuranceclaim ~ .,
  family  = binomial(link = "logit"),
  data    = insurance.data.train
)
summary(full.logit)
## 
## Call:
## glm(formula = insuranceclaim ~ ., family = binomial(link = "logit"), 
##     data = insurance.data.train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.733e+00  6.882e-01 -11.237  < 2e-16 ***
## age          3.137e-02  8.573e-03   3.659 0.000253 ***
## sex1         6.626e-02  1.845e-01   0.359 0.719451    
## bmi          2.790e-01  2.178e-02  12.807  < 2e-16 ***
## children1   -2.183e+00  2.413e-01  -9.046  < 2e-16 ***
## children2   -3.487e+00  2.951e-01 -11.815  < 2e-16 ***
## children3   -4.880e+00  4.021e-01 -12.137  < 2e-16 ***
## children4   -5.051e+00  7.472e-01  -6.760 1.38e-11 ***
## children5   -3.880e+00  8.897e-01  -4.361 1.29e-05 ***
## smoker1      4.112e+00  4.710e-01   8.730  < 2e-16 ***
## region1     -3.719e-01  2.617e-01  -1.421 0.155270    
## region2     -3.596e-01  2.668e-01  -1.348 0.177625    
## region3     -3.058e-01  2.628e-01  -1.164 0.244625    
## charges      1.022e-05  1.749e-05   0.584 0.559263    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1451.15  on 1068  degrees of freedom
## Residual deviance:  751.29  on 1055  degrees of freedom
## AIC: 779.29
## 
## Number of Fisher Scoring iterations: 6

Logistic Regression Results Interpretation

Coefficients

Intercept

The intercept is -7.733. When all predictor variables are zero, the log-odds of making an insurance claim is -7.733.

Age

For each one-unit increase in age, the log-odds of making an insurance claim increase by 0.0314 (p < 0.001).

Sex

The coefficient for ‘sex1’ is 0.06626 with a p-value of 0.719. It is not statistically significant, suggesting that gender may not be a significant predictor of insurance claims.

BMI

For each one-unit increase in BMI, the log-odds of making an insurance claim increase by 0.279 (p < 0.001).

Children

The coefficients for ‘children1’ through ‘children5’ represent the effect of having 1 to 5 children compared to having no children. As the number of children increases, the log-odds of making an insurance claim decrease significantly.

Smoker

Smokers (smoker1) have higher log-odds of making an insurance claim compared to non-smokers (4.112, p < 0.001).

Region

The coefficients for ‘region1’, ‘region2’, and ‘region3’ represent the effect of regions 1, 2, and 3 compared to the baseline region 0 (northeast). None of the regions are statistically significant.

Charges

The coefficient for ‘charges’ is not statistically significant (p = 0.559), suggesting that charges may not be a significant predictor.

Odds Ratios (Exponentiated Coefficients)

  • Age: The odds of making an insurance claim increase by approximately 3.2% for each one-year increase in age.
  • BMI: The odds of making an insurance claim increase by approximately 32.37% for each one-unit increase in BMI.
  • Children: Compared to having no children, the odds of making an insurance claim decrease significantly as the number of children increases.
  • Smoker: Smokers have approximately 61.2 times higher odds of making an insurance claim compared to non-smokers.

Model Fit

  • The model significantly improves the fit compared to the null model (p < 2e-16).
  • AIC (Akaike Information Criterion) is 779.29, a measure of model performance.

Dispersion Parameter

  • The dispersion parameter is 1, indicating that the binomial distribution is appropriate for the data.

Deviance

  • The null deviance (1451.15) represents the deviance when considering only the intercept.
  • The residual deviance (751.29) is the deviance after fitting the model with predictors. A lower deviance indicates a better fit.

Conclusion

  • Age, BMI, number of children, and smoking status appear to be significant predictors of insurance claims.
  • Charges and region may not be statistically significant predictors in this model.
# QQ plot of the full model's residuals against a normal reference.
# NOTE(review): residuals of a binary logit model are not expected to be
# normally distributed, so treat these checks as exploratory only.
car::qqPlot(residuals(full.logit), main = NA, pch = 19, col = 2, cex = 0.7)

## 1227  681 
##  982  554
# Shapiro-Wilk normality test on the same residuals (small p => non-normal)
shapiro.test(residuals(full.logit))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(full.logit)
## W = 0.97353, p-value = 4.624e-13

From the above residual plot we can observe that most of the data points appear approximately normal except for a few; however, according to the Shapiro-Wilk statistical test we can confirm that the residuals are not normally distributed.

The following null binary logit model was fitted with no predictors, only an intercept for the response variable.

#null binary logit model
null.logit <- glm(insuranceclaim ~ 1 ,data = insurance.data.train, 
                  family = binomial(link = "logit"))
summary(null.logit)
## 
## Call:
## glm(formula = insuranceclaim ~ 1, family = binomial(link = "logit"), 
##     data = insurance.data.train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.34193    0.06207   5.509 3.61e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1451.2  on 1068  degrees of freedom
## Residual deviance: 1451.2  on 1068  degrees of freedom
## AIC: 1453.2
## 
## Number of Fisher Scoring iterations: 4

The intercept is 0.34193 with a standard error of 0.06207. This represents the log-odds of making an insurance claim (insuranceclaim = 1) when there are no predictor variables. The coefficient is statistically significant (p-value < 0.001).

The intercept represents the log-odds of making an insurance claim (insuranceclaim = 1) when no predictor variables are included in the model.

The model with only the intercept doesn’t provide much information about the relationship between predictors and the response variable. It serves as a baseline against which more complex models can be compared.

The AIC is relatively high, suggesting that models with additional predictors might provide a better fit to the data.

Since there are no predictor variables, the model is essentially stating that the log-odds of making an insurance claim when there are no predictors is 0.34193.

# Residual diagnostics for the intercept-only model. With no predictors the
# residuals take only two distinct values (one per response class), so the
# Q-Q plot and normality test below are largely uninformative.
car::qqPlot(residuals(null.logit), main = NA, pch = 19, col = 2, cex = 0.7)

## 6 8 
## 3 5
shapiro.test(residuals(null.logit))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(null.logit)
## W = 0.62609, p-value < 2.2e-16

From the above residual plot we can observe that most of the data points were normal except a few; however, according to the Shapiro-Wilk statistical test we reject the null hypothesis and conclude that the residuals are not normal. To select the variables which impact the response variable, we applied a variable selection method on top of the full logit model, with the direction set to both.

# Stepwise selection in both directions, starting from the null model with the
# full model as the upper scope.
# NOTE(review): `data =` is not an argument of step(); it is silently absorbed
# by `...`. The search uses the data already stored in null.logit/full.logit.
both.logit <- step(null.logit, list(lower = formula(null.logit),
                                    upper = formula(full.logit)),
                   direction = "both", trace = 0, data = insurance.data.train)

# Selected formula and coefficient summary for the chosen model.
formula(both.logit)
## insuranceclaim ~ children + bmi + smoker + age
summary(both.logit)
## 
## Call:
## glm(formula = insuranceclaim ~ children + bmi + smoker + age, 
##     family = binomial(link = "logit"), data = insurance.data.train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.916898   0.665702 -11.893  < 2e-16 ***
## children1   -2.184042   0.240776  -9.071  < 2e-16 ***
## children2   -3.478784   0.294150 -11.827  < 2e-16 ***
## children3   -4.868815   0.399380 -12.191  < 2e-16 ***
## children4   -5.008194   0.747916  -6.696 2.14e-11 ***
## children5   -3.886841   0.892521  -4.355 1.33e-05 ***
## bmi          0.276118   0.021134  13.065  < 2e-16 ***
## smoker1      4.281770   0.374253  11.441  < 2e-16 ***
## age          0.034454   0.007052   4.885 1.03e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1451.15  on 1068  degrees of freedom
## Residual deviance:  754.42  on 1060  degrees of freedom
## AIC: 772.42
## 
## Number of Fisher Scoring iterations: 6

The model suggests that the number of children, BMI, smoking status, and age are significant predictors of insurance claims.

Smokers are associated with a significant increase in the likelihood of making an insurance claim.

Older individuals (higher age) are associated with a slight increase in the likelihood of making an insurance claim.

The model provides a significantly better fit than the null model, as evidenced by the lower residual deviance and AIC.

Interpret the coefficients cautiously. For example, the interpretation of the number of children assumes linearity, and interactions or nonlinear effects may be present.

# Q-Q plot and normality test of the stepwise model's residuals; the labels
# printed below identify the two most extreme observations.
car::qqPlot(residuals(both.logit), main = NA, pch = 19, col = 2, cex = 0.7)

## 1227  429 
##  982  347
# NOTE(review): as above, normality of deviance residuals is not expected for
# a binomial GLM.
shapiro.test(residuals(both.logit))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(both.logit)
## W = 0.9734, p-value = 4.245e-13

According to the above residual plot the data was not normal

There are outliers after fitting the model; let's refit the model after eliminating the observations whose residuals are greater than 3 standard deviations in absolute value. Following is the code implementation.

# Indices of observations whose residuals exceed 3 standard deviations.
extpts <- which(abs(residuals(both.logit)) > 3*sd(residuals(both.logit)))
nrow(insurance.data.train)
## [1] 1069
length(extpts)
## [1] 15
# Drop the 15 extreme observations and re-fit on the reduced training set.
data.train.2 <- insurance.data.train[-extpts,]
# NOTE(review): this OVERWRITES full.logit (previously fit on the complete
# training set) with a fit on the reduced data. Every later use of full.logit
# (AIC/BIC comparisons, predictions) therefore refers to this re-fit model.
full.logit <- glm(insuranceclaim ~ . ,data = data.train.2, 
                  family = binomial(link = "logit"))
# NOTE(review): `data =` is not a step() argument and is silently ignored;
# the search uses the data stored in full.logit (data.train.2).
both.logit.extpts <- step(full.logit, 
                   direction="both",trace=0, data = data.train.2)
formula(both.logit.extpts)
## insuranceclaim ~ age + bmi + children + smoker
summary(both.logit.extpts)
## 
## Call:
## glm(formula = insuranceclaim ~ age + bmi + children + smoker, 
##     family = binomial(link = "logit"), data = data.train.2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.666757   0.852947 -12.506  < 2e-16 ***
## age           0.038961   0.007922   4.918 8.74e-07 ***
## bmi           0.371495   0.027749  13.388  < 2e-16 ***
## children1    -2.696091   0.278386  -9.685  < 2e-16 ***
## children2    -4.272616   0.353996 -12.070  < 2e-16 ***
## children3    -7.124781   0.575888 -12.372  < 2e-16 ***
## children4    -7.670886   1.080693  -7.098 1.26e-12 ***
## children5    -4.539262   1.005026  -4.517 6.29e-06 ***
## smoker1       5.928896   0.513490  11.546  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1434.90  on 1053  degrees of freedom
## Residual deviance:  610.43  on 1045  degrees of freedom
## AIC: 628.43
## 
## Number of Fisher Scoring iterations: 7
# Residual diagnostics for the outlier-trimmed stepwise model.
car::qqPlot(residuals(both.logit.extpts), main = NA, pch = 19, col = 2, cex = 0.7)

## 660 232 
## 528 182
shapiro.test(residuals(both.logit.extpts))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(both.logit.extpts)
## W = 0.96129, p-value = 4.199e-16

From the residual plot we see the data points are deviated and are not normal.

The Akaike Information Criterion (AIC) is an information criterion used for model selection. For a model with \(p\) estimated parameters, it is defined as

\[ \text{AIC} = -2 \ell(\hat{\boldsymbol{\beta}};\mathbf{y}) + 2p. \] {#eq-AIC}

While we wish to select a model with largest maximized log-likelihood, AIC penalizes us for using a model with an unnecessarily large \(p\), the penalty term being \(2p\).

Let’s compare the AIC values

#Akaike Information Criterion
# Smaller AIC is better.
# NOTE(review): full.logit was re-fit on data.train.2 (1054 obs, outliers
# removed) while both.logit and null.logit were fit on the full training set
# (1069 obs); likelihood-based criteria computed on different data sets are
# not strictly comparable.
AIC(both.logit)
## [1] 772.4223
AIC(full.logit)
## [1] 634.5677
AIC(null.logit)
## [1] 1453.154

From the above values we can observe that the full.logit model has the smallest AIC. Note, however, that full.logit was refit on the reduced data set (data.train.2, outliers removed), so its AIC is not computed on the same observations as the other models and the comparison should be interpreted with caution.

Another useful information based model selection criterion is called the Bayesian Information Criterion (BIC), which uses a different penalty \(p\log(n)\): \[ \text{BIC} = -2 \ell(\hat{\boldsymbol{\beta}};\mathbf{y}) + p \log(n) \] {#eq-BIC}

Again, a model with smaller BIC is better which is full.logit model in comparison to the both logit model.

#Baysian Information Criteria
# Smaller BIC is better; BIC penalizes model size by p*log(n) instead of 2p.
# NOTE(review): the same caveat as for AIC applies -- full.logit was fit on a
# reduced data set, so these values are not computed on identical data.
BIC(both.logit)
## [1] 817.1926
BIC(full.logit)
## [1] 704.0126
BIC(null.logit)
## [1] 1458.129

Let’s predict the values of the test data set with the help of the predict function, passing the model and the data set along with type = "response", which automatically takes care of the logit conversion.

Test Data Accuracy

Predictions with the help of the both.logit as well as the full.logit model.

# Test-set predicted probabilities from the stepwise and full logit models,
# followed by 0.5-threshold confusion tables and overall accuracies.
pred.both.test <- predict(both.logit, newdata = insurance.data.test, type="response")
pred.full.test <- predict(full.logit, newdata = insurance.data.test, type="response")
# Rows: predicted claim (TRUE = probability > 0.5); columns: observed class.
(table.both <- table(pred.both.test > 0.5, insurance.data.test$insuranceclaim))
##        
##           0   1
##   FALSE  90  18
##   TRUE   21 139
(table.full <- table(pred.full.test > 0.5, insurance.data.test$insuranceclaim))
##        
##           0   1
##   FALSE  95  16
##   TRUE   16 141
# Accuracy = (true negatives + true positives) / total, as a percentage.
(accuracy.both <- round((sum(diag(table.both))/sum(table.both))*100,2)) 
## [1] 85.45
(accuracy.full <- round((sum(diag(table.full))/sum(table.full))*100,2))
## [1] 88.06

We can observe that the accuracy on the test data for the full model is higher when compared to the both model.

ROC curve. Another useful metric is area under the receiver operating characteristics (ROC) curve, which used to evaluate the prediction accuracy in binary and multi-class classification.

It quantifies the trade-off between the sensitivity or true positive rate (TPR) and specificity or false positive rate (FPR) of a prediction.

Sensitivity, or true positive (TP) is the probability that a binary response is predicted as a 1 (or, yes), given that it is an event (or, yes).

Specificity, or true negative (TN) is the probability that a binary response is predicted as a 0 (or, no), given that it is a non-event (or, no).

par(mfrow = c(1,2))
# ROC curve for the stepwise ("both") model on the test data.
roc.both <- roc(insurance.data.test$insuranceclaim ~ pred.both.test, plot = TRUE, 
                legacy.axes = TRUE, print.auc = TRUE, main = "Both Model ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# ROC curve for the full model on the test data.
# FIX: the original assigned this result to `roc.both` as well, silently
# overwriting the "both" model's ROC object; store it under its own name.
roc.full <- roc(insurance.data.test$insuranceclaim ~ pred.full.test, plot = TRUE, 
                legacy.axes = TRUE, print.auc = TRUE, main = "Full Model ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Train Data Accuracy

Following are the predictions, accuracy and roc curves on the train data.

# Training-set predicted probabilities, confusion tables, accuracies, and ROC
# curves for the stepwise and full logit models.
pred.both <- predict(both.logit, newdata = insurance.data.train, type="response")
pred.full <- predict(full.logit, newdata = insurance.data.train, type="response")
(table.both <- table(pred.both > 0.5, insurance.data.train$insuranceclaim))
##        
##           0   1
##   FALSE 371  62
##   TRUE   73 563
(table.full <- table(pred.full > 0.5, insurance.data.train$insuranceclaim))
##        
##           0   1
##   FALSE 384  63
##   TRUE   60 562
(accuracy.both <- round((sum(diag(table.both))/sum(table.both))*100,2)) 
## [1] 87.37
(accuracy.full <- round((sum(diag(table.full))/sum(table.full))*100,2))
## [1] 88.49
par(mfrow = c(1,2))
# ROC curve for the stepwise ("both") model on the training data.
roc.both <- roc(insurance.data.train$insuranceclaim ~ pred.both, plot = TRUE, 
                legacy.axes = TRUE, print.auc = TRUE, main = "Both Model ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# FIX: the original reassigned `roc.both` here, overwriting the "both" model's
# ROC object with the full model's; store it under its own name.
roc.full <- roc(insurance.data.train$insuranceclaim ~ pred.full, plot = TRUE, 
                legacy.axes = TRUE, print.auc = TRUE, main = "Full Model ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

We can observe that the train and test data accuracy were similar for the full model where as there is a slight difference between them in the both model. ROC curve looks similar for both as well as the full model.

##backward

Model was fitted based on the backward direction and the aic value is 628 which is less when compared to the both model.

# Backward elimination starting from the (re-fit) full model.
# NOTE(review): `data =` is not an argument of step() and is silently ignored.
# The search uses the data stored inside full.logit -- which at this point is
# data.train.2 (outliers removed) -- as the Call in the summary below confirms.
both.logit.backward <- step(full.logit, 
                   direction="backward",trace=0, data = insurance.data.train)
formula(both.logit.backward)
## insuranceclaim ~ age + bmi + children + smoker
summary(both.logit.backward)
## 
## Call:
## glm(formula = insuranceclaim ~ age + bmi + children + smoker, 
##     family = binomial(link = "logit"), data = data.train.2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.666757   0.852947 -12.506  < 2e-16 ***
## age           0.038961   0.007922   4.918 8.74e-07 ***
## bmi           0.371495   0.027749  13.388  < 2e-16 ***
## children1    -2.696091   0.278386  -9.685  < 2e-16 ***
## children2    -4.272616   0.353996 -12.070  < 2e-16 ***
## children3    -7.124781   0.575888 -12.372  < 2e-16 ***
## children4    -7.670886   1.080693  -7.098 1.26e-12 ***
## children5    -4.539262   1.005026  -4.517 6.29e-06 ***
## smoker1       5.928896   0.513490  11.546  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1434.90  on 1053  degrees of freedom
## Residual deviance:  610.43  on 1045  degrees of freedom
## AIC: 628.43
## 
## Number of Fisher Scoring iterations: 7

Features such as age, bmi, children and smoker are the ones which were identified by the backward model as more relevant in predicting the response variable.

However, the residual plot does not look good, as it indicates the data points are not normal.

# Residual diagnostics for the backward-selected model (identical fit to
# both.logit.extpts, so the same extreme points are flagged).
car::qqPlot(residuals(both.logit.backward), main = NA, pch = 19, col = 2, cex = 0.7)

## 660 232 
## 528 182
shapiro.test(residuals(both.logit.backward))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(both.logit.backward)
## W = 0.96129, p-value = 4.199e-16

##forward

Following is the code for the forward selection method; its AIC value is 634, which is slightly greater than that of the backward model.

# Forward selection starting from the full model.
# NOTE(review): forward selection can only ADD terms; starting from the full
# model there is nothing to add, so step() returns the full model unchanged --
# which is why the formula below retains every predictor. A true forward
# search should start from null.logit with the full model as the upper scope.
# Also, `data =` is not a step() argument and is silently ignored.
both.logit.forward <- step(full.logit, 
                   direction="forward",trace=0, data = insurance.data.train)
formula(both.logit.forward)
## insuranceclaim ~ age + sex + bmi + children + smoker + region + 
##     charges
summary(both.logit.forward)
## 
## Call:
## glm(formula = insuranceclaim ~ age + sex + bmi + children + smoker + 
##     region + charges, family = binomial(link = "logit"), data = data.train.2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.052e+01  8.758e-01 -12.009  < 2e-16 ***
## age          3.617e-02  9.719e-03   3.722 0.000198 ***
## sex1         7.141e-02  2.071e-01   0.345 0.730266    
## bmi          3.777e-01  2.861e-02  13.199  < 2e-16 ***
## children1   -2.702e+00  2.799e-01  -9.653  < 2e-16 ***
## children2   -4.288e+00  3.559e-01 -12.049  < 2e-16 ***
## children3   -7.162e+00  5.819e-01 -12.308  < 2e-16 ***
## children4   -7.697e+00  1.087e+00  -7.078 1.46e-12 ***
## children5   -4.529e+00  1.005e+00  -4.508 6.55e-06 ***
## smoker1      5.805e+00  6.032e-01   9.624  < 2e-16 ***
## region1     -4.717e-01  2.935e-01  -1.607 0.108100    
## region2     -4.886e-01  2.982e-01  -1.639 0.101253    
## region3     -3.348e-01  2.931e-01  -1.142 0.253393    
## charges      8.773e-06  1.982e-05   0.443 0.658053    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1434.90  on 1053  degrees of freedom
## Residual deviance:  606.57  on 1040  degrees of freedom
## AIC: 634.57
## 
## Number of Fisher Scoring iterations: 7

Features such as age, sex, bmi, children, region, charges and smoker are the ones retained by the forward model. Since forward selection here starts from the full model, no terms can be added, so the full set of predictors is kept.

# Residual diagnostics for the forward-selected (i.e. full) model.
car::qqPlot(residuals(both.logit.forward), main = NA, pch = 19, col = 2, cex = 0.7)

## 232 660 
## 182 528
shapiro.test(residuals(both.logit.forward))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(both.logit.forward)
## W = 0.96088, p-value = 3.394e-16

However, the residual plot does not look good, as it indicates the data points are not normal.

Following are the predictions, accuracy of the test data for both forward and backward elimination models.

# Test-set predictions and accuracies for the forward- and backward-selected
# models, using the 0.5 probability threshold.
pred.both.forward <- predict(both.logit.forward, newdata = insurance.data.test, type="response")
pred.both.backward <- predict(both.logit.backward, newdata = insurance.data.test, type="response")
(table.both.forward <- table(pred.both.forward > 0.5, insurance.data.test$insuranceclaim))
##        
##           0   1
##   FALSE  95  16
##   TRUE   16 141
# NOTE(review): the names below say "full.backward" but they hold the
# BACKWARD-selected model's results; "table.both.backward" /
# "accuracy.both.backward" would be consistent with the names above.
(table.full.backward <- table(pred.both.backward > 0.5, insurance.data.test$insuranceclaim))
##        
##           0   1
##   FALSE  92  18
##   TRUE   19 139
(accuracy.both.forward <- round((sum(diag(table.both.forward))/sum(table.both.forward))*100,2)) 
## [1] 88.06
(accuracy.full.backward <- round((sum(diag(table.full.backward))/sum(table.full.backward))*100,2))
## [1] 86.19

We can observe that the accuracies of both the forward and backward selection methods are similar to that of the both-direction method.

Testing Strategy 2 - K-Fold Validation

K-fold cross-validation is a resampling technique commonly used in machine learning to assess the performance and generalization ability of a predictive model. The basic idea is to partition the dataset into k subsets (folds), train the model on k-1 folds, and evaluate it on the remaining fold. This process is repeated k times, with each of the k folds used exactly once as the validation data.

Following code implements the k-fold validation with 10 folds on the full model and the average accuracy obtained is 87%(approx).

# Binomial Full Logit Model - K fold validaton

# Set the number of folds (K)
num_folds <- 10

# Create an index vector for splitting
set.seed(123)  # for reproducibility
indices <- createFolds(insurance.data$insuranceclaim, k = num_folds, list = TRUE)

# Initialize a variable to store cross-validation results
cv_results <- data.frame(Accuracies = double(num_folds))

# Perform K-Fold Cross-Validation
for (i in 1:num_folds) {
  # Split the data into training and testing sets
  train_data <- insurance.data[-indices[[i]], ]
  test_data <- insurance.data[indices[[i]], ]
  
  # Re-fit the full logit model on this fold's training data
  model <- glm(insuranceclaim ~ . ,data = train_data, 
                  family = binomial(link = "logit"))
  
  # Make predictions on the test data
  predictions <- predict(model, newdata = test_data,  type="response")
  
  # Confusion table at the 0.5 threshold (the wrapping parentheses do not
  # print inside a loop; kept for consistency with the interactive chunks)
  (table.full <- table(predictions > 0.5, test_data$insuranceclaim))

  acc <- round((sum(diag(table.full))/sum(table.full))*100,2)

  # Store the accuracy (a classification metric, not RMSE) in the results
  # dataframe
  cv_results$Accuracies[i] <- acc
}

# Display cross-validation results
print(cv_results)
##    Accuracies
## 1       86.47
## 2       87.31
## 3       87.41
## 4       84.96
## 5       87.22
## 6       85.07
## 7       85.82
## 8       84.33
## 9       93.23
## 10      86.57
# Accuracies is already numeric; this coercion is a harmless no-op.
cv_results$Accuracies <- as.numeric(cv_results$Accuracies)
mean_accuracy <- mean(cv_results$Accuracies, na.rm = TRUE)
mean_accuracy
## [1] 86.839
# Binomial Logit Model - K fold validation

Following code implements the k-fold validation with 10 folds on the both model and the average accuracy obtained is 88%.

# Binomial Both Logit Model - K fold validaton

# Set the number of folds (K)
num_folds <- 10

# Create an index vector for splitting
set.seed(123)  # for reproducibility
indices <- createFolds(insurance.data$insuranceclaim, k = num_folds, list = TRUE)

# Initialize a variable to store cross-validation results
cv_results <- data.frame(Accuracies = double(num_folds))

# Perform K-Fold Cross-Validation
for (i in 1:num_folds) {
  # Split the data into training and testing sets
  train_data <- insurance.data[-indices[[i]], ]
  test_data <- insurance.data[indices[[i]], ]
  
  # FIX: the original called step(full.logit, ..., data = train_data), but
  # step() has no `data` argument -- it was silently ignored, so the model
  # was never re-fit on this fold's training data and the evaluation leaked
  # the observations full.logit was trained on. Re-fit the full model on the
  # fold's training data first, then run stepwise selection on that fit.
  fold.full <- glm(insuranceclaim ~ ., data = train_data,
                   family = binomial(link = "logit"))
  model <- step(fold.full, direction = "both", trace = 0)
  
  # Make predictions on the test data
  predictions <- predict(model, newdata = test_data,  type="response")
  
  (table.full <- table(predictions > 0.5, test_data$insuranceclaim))

  acc <- round((sum(diag(table.full))/sum(table.full))*100,2)

  # Store the accuracy in the results dataframe
  cv_results$Accuracies[i] <- acc
}

# Display cross-validation results
# NOTE(review): the fold accuracies printed below were produced by the
# original (leaky) code and will change after re-running with the fix.
print(cv_results)
##    Accuracies
## 1       86.47
## 2       88.06
## 3       88.15
## 4       85.71
## 5       88.72
## 6       86.57
## 7       88.81
## 8       85.82
## 9       93.23
## 10      88.81
cv_results$Accuracies <- as.numeric(cv_results$Accuracies)
mean_accuracy <- mean(cv_results$Accuracies, na.rm = TRUE)
mean_accuracy
## [1] 88.035
# Binomial Logit Model - K fold validation

The probit link function is an alternative link function.

Starting with the standard normal c.d.f \(\phi(z)\) which lies in the interval \([0, 1]\), the probit (or inverse normal c.d.f.) link assumes that

\[ \phi^{-1}(\pi_i) = \eta_i \] {#eq-probit1}

so that

\[ \pi_i = \Phi(\eta_i) \] {#eq-probit2}

where \(\eta_i\) is given by @eq-binarysys as

\[ \eta_i = \beta_0 + \sum_{j=1}^p \beta_j X_{i,j} = \mathbf{x}_i' \boldsymbol{\beta}. \] Null Hypothesis (\(H_0\)): \[ H_0: \beta_j = 0 \]

The null hypothesis asserts that there is no association between the independent variable \(X_j\) and the probability of the dependent variable being in the “success” category.

Alternative Hypothesis (\(H_1\)): \[ H_1: \beta_j \neq 0 \]

The alternative hypothesis suggests that the independent variable \(X_j\) does have a significant association with the probability of the event.

# Probit Full Model

# Full probit model: same linear predictor as the full logit model, but with
# the standard-normal CDF as the link, so coefficients are on the z-score
# (probit) scale rather than log-odds.
full.probit <- glm(insuranceclaim ~ . ,data = insurance.data.train , 
                   family = binomial(link = "probit"))
summary(full.probit)
## 
## Call:
## glm(formula = insuranceclaim ~ ., family = binomial(link = "probit"), 
##     data = insurance.data.train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.012e+00  3.591e-01 -11.173  < 2e-16 ***
## age          1.821e-02  4.694e-03   3.879 0.000105 ***
## sex1         3.719e-02  1.028e-01   0.362 0.717537    
## bmi          1.458e-01  1.108e-02  13.158  < 2e-16 ***
## children1   -1.232e+00  1.325e-01  -9.295  < 2e-16 ***
## children2   -1.923e+00  1.561e-01 -12.317  < 2e-16 ***
## children3   -2.571e+00  2.026e-01 -12.692  < 2e-16 ***
## children4   -2.623e+00  3.826e-01  -6.857 7.02e-12 ***
## children5   -2.107e+00  4.526e-01  -4.654 3.25e-06 ***
## smoker1      2.162e+00  2.518e-01   8.585  < 2e-16 ***
## region1     -2.164e-01  1.460e-01  -1.482 0.138392    
## region2     -1.982e-01  1.496e-01  -1.325 0.185238    
## region3     -1.820e-01  1.475e-01  -1.234 0.217249    
## charges      6.872e-06  9.576e-06   0.718 0.472970    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1451.15  on 1068  degrees of freedom
## Residual deviance:  765.01  on 1055  degrees of freedom
## AIC: 793.01
## 
## Number of Fisher Scoring iterations: 7

Intercept ((Intercept)): The coefficient is -4.012, indicating the probit index (z-score scale — note that probit coefficients are not log-odds) of the outcome when all predictors are zero. It is significantly negative, suggesting a lower likelihood of making an insurance claim.

Age (age): A one-unit increase in age is associated with an increase in the log-odds of making an insurance claim by 0.01821. This effect is statistically significant.

Sex (sex1): The coefficient is not statistically significant at the 0.05 significance level, suggesting that gender may not be a significant predictor of insurance claims.

BMI (bmi): A one-unit increase in BMI is associated with an increase in the log-odds of making an insurance claim by 0.1458. This effect is statistically significant.

Children (children1, children2, children3, children4, children5): The number of children has a significant negative impact on the log-odds of making an insurance claim.

Smoker (smoker1): Being a smoker is associated with an increase in the log-odds of making an insurance claim by 2.162. This effect is highly significant.

Region (region1, region2, region3): The coefficients for regions are not statistically significant, suggesting that region may not be a significant predictor of insurance claims.

Charges (charges): The coefficient is not statistically significant at the 0.05 significance level, indicating that charges may not be a significant predictor of insurance claims.

# Residual diagnostics for the full probit model; as with the logit fits,
# normality of deviance residuals is not expected for a binomial GLM.
car::qqPlot(residuals(full.probit), main = NA, pch = 19, col = 2, cex = 0.7)

## 1227  681 
##  982  554
shapiro.test(residuals(full.probit))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(full.probit)
## W = 0.97872, p-value = 2.062e-11

From the residual plot we can observe that the data points are not normal, which is also shown statistically by the Shapiro-Wilk test.

Again, a model with smaller BIC is better which is full.logit model in comparison to the both logit model.

Following are the train and test accuracies

#train data accuracy
# Predicted probabilities from the full probit model on the training data.
pred.both <- predict(full.probit, newdata = insurance.data.train, type="response")
(table.both <- table(pred.both > 0.5, insurance.data.train$insuranceclaim))
##        
##           0   1
##   FALSE 364  56
##   TRUE   80 569
# FIX: the original computed the accuracy from `table.both.forward`, a stale
# table left over from the earlier logit models (hence the repeated 88.06).
# Use the confusion table just computed: (364 + 569) / 1069.
(accuracy.probit.train <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 87.28
#test data accuracy
pred.both <- predict(full.probit, newdata = insurance.data.test, type="response")
(table.both <- table(pred.both > 0.5, insurance.data.test$insuranceclaim))
##        
##           0   1
##   FALSE  87  17
##   TRUE   24 140
# (87 + 140) / 268
(accuracy.probit.test <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 84.7

Test data accuracy of the models are similar to the logit model. Also, both the train and test data accuracy is same

Following is the code to fit the probit model on the reduced predictors such as age, bmi, children, smoker

# Probit Model

# Reduced probit model using only the predictors selected by the earlier
# stepwise searches (age, bmi, children, smoker).
full.predictors.probit <- glm(insuranceclaim ~ age + bmi + children + smoker ,data = insurance.data.train , 
                   family = binomial(link = "probit"))
summary(full.predictors.probit)
## 
## Call:
## glm(formula = insuranceclaim ~ age + bmi + children + smoker, 
##     family = binomial(link = "probit"), data = insurance.data.train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.124238   0.344506 -11.971  < 2e-16 ***
## age          0.020223   0.003899   5.187 2.14e-07 ***
## bmi          0.144275   0.010687  13.500  < 2e-16 ***
## children1   -1.232168   0.132227  -9.319  < 2e-16 ***
## children2   -1.912556   0.155338 -12.312  < 2e-16 ***
## children3   -2.568484   0.201282 -12.761  < 2e-16 ***
## children4   -2.583550   0.380364  -6.792 1.10e-11 ***
## children5   -2.092116   0.452774  -4.621 3.83e-06 ***
## smoker1      2.279723   0.191072  11.931  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1451.15  on 1068  degrees of freedom
## Residual deviance:  768.43  on 1060  degrees of freedom
## AIC: 786.43
## 
## Number of Fisher Scoring iterations: 7

The binomial regression model with the probit link provides insights into the factors influencing the likelihood of making an insurance claim. The intercept, set at -4.124238, represents the baseline probit index when all other predictors are zero. As individuals age, the probit index of making a claim increases by 0.020223 for each additional year. Similarly, a rise in BMI corresponds to an increase of 0.144275 in the probit index for making an insurance claim.

The presence of children plays a notable role. Having one, two, three, four, or five children results in log-odds reductions of -1.232168, -1.912556, -2.568484, -2.583550, and -2.092116, respectively, in the likelihood of making a claim. In other words, the number of children inversely affects the probability of an insurance claim.

On the other hand, being a smoker significantly elevates the log-odds by 2.279723, indicating a substantial increase in the likelihood of making an insurance claim for smokers. These insights provide a nuanced understanding of how different factors contribute to the complex dynamics of insurance claim predictions.

# Residual diagnostics for the reduced probit model.
car::qqPlot(residuals(full.predictors.probit), main = NA, pch = 19, col = 2, cex = 0.7)

## 1227  429 
##  982  347
shapiro.test(residuals(full.predictors.probit))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(full.predictors.probit)
## W = 0.97837, p-value = 1.574e-11

Above plot illustrates that the data points are not normal as there are deviations from the normal line.

# Train- and test-set accuracies for the reduced probit model.
pred.both.train.probit <- predict(full.predictors.probit, newdata = insurance.data.train, type="response")
(table.both <- table(pred.both.train.probit > 0.5, insurance.data.train$insuranceclaim))
##        
##           0   1
##   FALSE 366  57
##   TRUE   78 568
# FIX: the original computed the accuracy from `table.both.forward`, a stale
# table left over from the earlier logit models, so it reported 88.06 for
# both splits. Compute it from the table just built: (366 + 568) / 1069.
(accuracy.probit.reduced.train <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 87.37
pred.both.test.probit <- predict(full.predictors.probit, newdata = insurance.data.test, type="response")
(table.both <- table(pred.both.test.probit > 0.5, insurance.data.test$insuranceclaim))
##        
##           0   1
##   FALSE  88  15
##   TRUE   23 142
# (88 + 142) / 268
(accuracy.probit.reduced.test <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 85.82

The classification model demonstrates an accuracy of 85.82% ((88 + 142) / 268), signifying the proportion of accurate predictions relative to the total predictions. Notably, there are 142 instances of true positives, denoting cases where the model correctly predicted class 1, and 88 instances of true negatives, indicating accurate predictions for class 0. On the other hand, the model incurred 23 false positives, where it predicted class 1 but the actual class was 0, and 15 false negatives, representing cases where the model predicted class 0 but the true class was 1.

# Side-by-side ROC curves (with AUC printed) for the reduced probit model on
# the training and test data.
par(mfrow = c(1,2))

# Training ROC curve
roc.both.train <- roc(insurance.data.train$insuranceclaim ~ pred.both.train.probit, plot = TRUE, 
                      legacy.axes = TRUE, print.auc = TRUE, main = "Training ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Testing ROC curve
roc.both.test <- roc(insurance.data.test$insuranceclaim ~ pred.both.test.probit, plot = TRUE, 
                     legacy.axes = TRUE, print.auc = TRUE, main = "Testing ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Binomial Full Probit Model - K fold validaton


# Set the number of folds (K)
num_folds <- 10

# Create an index vector for splitting
set.seed(123)  # for reproducibility
indices <- createFolds(insurance.data$insuranceclaim, k = num_folds, list = TRUE)

# Initialize a variable to store cross-validation results
cv_results <- data.frame(Accuracies = double(num_folds))

# Perform K-Fold Cross-Validation
for (i in 1:num_folds) {
  # Split the data into training and testing sets
  train_data <- insurance.data[-indices[[i]], ]
  test_data <- insurance.data[indices[[i]], ]
  
  # Fit our model on the training data
  model <- glm(insuranceclaim ~ . ,data = train_data, 
                  family = binomial(link = "probit"))
  
  # Make predictions on the test data
  predictions <- predict(model, newdata = test_data,  type="response")
  
  # Confusion table at the 0.5 threshold
  (table.full <- table(predictions > 0.5, test_data$insuranceclaim))

  acc <- round((sum(diag(table.full))/sum(table.full))*100,2)

  # Store the accuracy (a classification metric, not RMSE) in the results
  # dataframe
  cv_results$Accuracies[i] <- acc
}

# Display cross-validation results
print(cv_results)
##    Accuracies
## 1       86.47
## 2       86.57
## 3       86.67
## 4       84.21
## 5       87.22
## 6       84.33
## 7       85.07
## 8       84.33
## 9       92.48
## 10      85.82
# Accuracies is already numeric; this coercion is a harmless no-op.
cv_results$Accuracies <- as.numeric(cv_results$Accuracies)
mean_accuracy <- mean(cv_results$Accuracies, na.rm = TRUE)
mean_accuracy
## [1] 86.317
# Binomial Logit Model - K fold validation

The model demonstrates relatively consistent performance across different folds, with accuracy ranging from 84.21% to 92.48%. This suggests that the model is not heavily dependent on a specific subset of the data and maintains its predictive capability across various scenarios. The mean accuracy of approximately 86.32% indicates that, on average, the model correctly predicts whether an individual will make an insurance claim in about 86.32% of cases. This suggests a reasonably effective predictive performance.

3. Classification and Regression Trees

We define two impurity measures, Gini index and entropy, for classifying a response with \(J\) categories.

The Gini index is defined by

\[ \mbox{Gini index} = 1 - \sum_{j=1}^J p^2_j, \] {#eq-gini-index}

where \(p_j = P(Y \in \mbox{class } j),~j=1,\ldots,J\). The Gini index lies in \([0, 1 - 1/J]\). The value \(0\) denotes a pure classification where all the cases belong to a single class, while the maximum value \(1 - 1/J\) is attained when the cases are distributed equally across all \(J\) classes. For a binary response (\(J = 2\)), this maximum equals \(0.5\).

Entropy is an alternate impurity measure, which lies in \([0, \log_2 J]\) (and hence in \([0, 1]\) for a binary response):

\[ \mbox{Entropy} = \sum_{j=1}^J-p_j \log_2 (p_j). \] {#eq-entropy}

The rpart package uses the Gini index as the impurity measure and, for a complexity parameter \(C_p\), minimizes the cost

\[ \text{Cost}_{C_p}(\mbox{Tree}) = \mbox{Error(Tree)} + C_p \, \mathcal{N}(\mbox{Tree}), \]

where Error(Tree) is the fraction of misclassified cases and \(\mathcal{N}(\mbox{Tree})\) is the number of leaf nodes in the tree.

# Classification and Regression Trees

# Re-read the raw data so this section starts from the full data set
insurance.data.dup <- read.csv("~/Documents/GitHub/GitHub/Insurance-Claim-Prediction/data/insurance.csv")
insurance.data <- insurance.data.dup

# Stratified 80/20 train/test split on the response so both sets keep
# the original class proportions
set.seed(12345)
train.prop <- 0.80
strats <- insurance.data$insuranceclaim
rr <- split(seq_along(strats), strats)
# floor() makes the per-stratum sample size an explicit integer instead of
# relying on sample()'s silent truncation of a fractional size
idx <- sort(as.numeric(unlist(lapply(rr,
        function(x) sample(x, floor(length(x) * train.prop))))))
insurance.data.train <- insurance.data[idx, ]
insurance.data.test <- insurance.data[-idx, ]
# Class balance in the training set
table(insurance.data.train$insuranceclaim)/nrow(insurance.data.train)
## 
##         0         1 
## 0.4149533 0.5850467
# Grow a deliberately deep classification tree: minsplit = 1 and a small
# cp let rpart keep splitting until nodes are (nearly) pure.
fit.allp <- rpart(
  insuranceclaim ~ .,
  data = insurance.data.train,
  method = "class",
  control = rpart.control(minsplit = 1, cp = 0.001)
)
summary(fit.allp)
## Call:
## rpart(formula = insuranceclaim ~ ., data = insurance.data.train, 
##     method = "class", control = rpart.control(minsplit = 1, cp = 0.001))
##   n= 1070 
## 
##             CP nsplit   rel error     xerror       xstd
## 1  0.315315315      0 1.000000000 1.00000000 0.03629976
## 2  0.128378378      1 0.684684685 0.71846847 0.03370082
## 3  0.051801802      3 0.427927928 0.43693694 0.02838429
## 4  0.043918919      4 0.376126126 0.42792793 0.02815421
## 5  0.033783784      6 0.288288288 0.32432432 0.02514270
## 6  0.027027027      7 0.254504505 0.29954955 0.02430641
## 7  0.020270270      8 0.227477477 0.25675676 0.02273037
## 8  0.018018018     10 0.186936937 0.23648649 0.02191712
## 9  0.016891892     11 0.168918919 0.19594595 0.02013546
## 10 0.015765766     13 0.135135135 0.19594595 0.02013546
## 11 0.013513514     14 0.119369369 0.17117117 0.01892453
## 12 0.012762763     15 0.105855856 0.16891892 0.01880907
## 13 0.009009009     18 0.067567568 0.10585586 0.01509774
## 14 0.006756757     19 0.058558559 0.10360360 0.01494356
## 15 0.005630631     22 0.038288288 0.08333333 0.01346096
## 16 0.004504505     24 0.027027027 0.08333333 0.01346096
## 17 0.002252252     27 0.013513514 0.05630631 0.01112893
## 18 0.001126126     30 0.006756757 0.05405405 0.01090929
## 19 0.001000000     36 0.000000000 0.05180180 0.01068470
## 
## Variable importance
##      bmi children  charges   smoker      age   region 
##       31       26       20       12       10        1 
## 
## Node number 1: 1070 observations,    complexity param=0.3153153
##   predicted class=1  expected loss=0.4149533  P(node) =1
##     class counts:   444   626
##    probabilities: 0.415 0.585 
##   left son=2 (244 obs) right son=3 (826 obs)
##   Primary splits:
##       bmi      < 25.9825  to the left,  improve=87.44814, (0 missing)
##       children < 0.5      to the right, improve=82.55445, (0 missing)
##       smoker   < 0.5      to the left,  improve=57.01775, (0 missing)
##       charges  < 33047.5  to the left,  improve=45.12917, (0 missing)
##       age      < 41.5     to the left,  improve=16.60421, (0 missing)
## 
## Node number 2: 244 observations,    complexity param=0.0518018
##   predicted class=0  expected loss=0.2131148  P(node) =0.2280374
##     class counts:   192    52
##    probabilities: 0.787 0.213 
##   left son=4 (191 obs) right son=5 (53 obs)
##   Primary splits:
##       smoker  < 0.5      to the left,  improve=34.378990, (0 missing)
##       charges < 14511.86 to the left,  improve=26.570560, (0 missing)
##       bmi     < 17.575   to the right, improve= 7.247083, (0 missing)
##       age     < 63.5     to the left,  improve= 2.344399, (0 missing)
##       region  < 2.5      to the right, improve= 0.787562, (0 missing)
##   Surrogate splits:
##       charges < 14511.86 to the left,  agree=0.939, adj=0.717, (0 split)
##       bmi     < 25.845   to the left,  agree=0.787, adj=0.019, (0 split)
## 
## Node number 3: 826 observations,    complexity param=0.1283784
##   predicted class=1  expected loss=0.3050847  P(node) =0.7719626
##     class counts:   252   574
##    probabilities: 0.305 0.695 
##   left son=6 (473 obs) right son=7 (353 obs)
##   Primary splits:
##       children < 0.5      to the right, improve=114.753100, (0 missing)
##       smoker   < 0.5      to the left,  improve= 30.605070, (0 missing)
##       charges  < 33047.5  to the left,  improve= 24.727040, (0 missing)
##       age      < 41.5     to the left,  improve= 13.870640, (0 missing)
##       bmi      < 31.01    to the left,  improve=  9.525539, (0 missing)
##   Surrogate splits:
##       charges < 3220.372 to the right, agree=0.662, adj=0.210, (0 split)
##       age     < 25.5     to the right, agree=0.646, adj=0.173, (0 split)
##       bmi     < 26.6475  to the right, agree=0.579, adj=0.014, (0 split)
## 
## Node number 4: 191 observations,    complexity param=0.01576577
##   predicted class=0  expected loss=0.07329843  P(node) =0.1785047
##     class counts:   177    14
##    probabilities: 0.927 0.073 
##   left son=8 (184 obs) right son=9 (7 obs)
##   Primary splits:
##       bmi      < 17.575   to the right, improve=12.4802500, (0 missing)
##       charges  < 30225.63 to the left,  improve= 1.7265910, (0 missing)
##       children < 1.5      to the left,  improve= 0.8043285, (0 missing)
##       age      < 63.5     to the left,  improve= 0.7360038, (0 missing)
##       region   < 0.5      to the right, improve= 0.1247432, (0 missing)
## 
## Node number 5: 53 observations,    complexity param=0.01689189
##   predicted class=1  expected loss=0.2830189  P(node) =0.04953271
##     class counts:    15    38
##    probabilities: 0.283 0.717 
##   left son=10 (23 obs) right son=11 (30 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=11.074650, (0 missing)
##       age      < 41       to the left,  improve= 6.509434, (0 missing)
##       charges  < 19479.9  to the left,  improve= 4.805730, (0 missing)
##       bmi      < 19.1975  to the left,  improve= 4.448209, (0 missing)
##       region   < 2.5      to the right, improve= 1.610444, (0 missing)
##   Surrogate splits:
##       bmi     < 24.265   to the right, agree=0.660, adj=0.217, (0 split)
##       region  < 2.5      to the right, agree=0.623, adj=0.130, (0 split)
##       charges < 16717.01 to the right, agree=0.604, adj=0.087, (0 split)
##       age     < 29.5     to the right, agree=0.585, adj=0.043, (0 split)
## 
## Node number 6: 473 observations,    complexity param=0.1283784
##   predicted class=0  expected loss=0.4672304  P(node) =0.4420561
##     class counts:   252   221
##    probabilities: 0.533 0.467 
##   left son=12 (378 obs) right son=13 (95 obs)
##   Primary splits:
##       smoker   < 0.5      to the left,  improve=52.43251, (0 missing)
##       charges  < 30124.26 to the left,  improve=45.66116, (0 missing)
##       age      < 40.5     to the left,  improve=26.26169, (0 missing)
##       bmi      < 31.1925  to the left,  improve=15.70518, (0 missing)
##       children < 2.5      to the right, improve=14.09699, (0 missing)
##   Surrogate splits:
##       charges < 30124.26 to the left,  agree=0.941, adj=0.705, (0 split)
##       bmi     < 26.1525  to the right, agree=0.801, adj=0.011, (0 split)
## 
## Node number 7: 353 observations
##   predicted class=1  expected loss=0  P(node) =0.3299065
##     class counts:     0   353
##    probabilities: 0.000 1.000 
## 
## Node number 8: 184 observations,    complexity param=0.004504505
##   predicted class=0  expected loss=0.03804348  P(node) =0.1719626
##     class counts:   177     7
##    probabilities: 0.962 0.038 
##   left son=16 (157 obs) right son=17 (27 obs)
##   Primary splits:
##       bmi      < 25.3325  to the left,  improve=2.14679700, (0 missing)
##       charges  < 30225.63 to the left,  improve=1.86083400, (0 missing)
##       children < 1.5      to the left,  improve=1.28220600, (0 missing)
##       age      < 63.5     to the left,  improve=0.86299570, (0 missing)
##       region   < 2.5      to the left,  improve=0.08394667, (0 missing)
##   Surrogate splits:
##       charges < 28149.52 to the left,  agree=0.859, adj=0.037, (0 split)
## 
## Node number 9: 7 observations
##   predicted class=1  expected loss=0  P(node) =0.006542056
##     class counts:     0     7
##    probabilities: 0.000 1.000 
## 
## Node number 10: 23 observations,    complexity param=0.01689189
##   predicted class=0  expected loss=0.3478261  P(node) =0.02149533
##     class counts:    15     8
##    probabilities: 0.652 0.348 
##   left son=20 (15 obs) right son=21 (8 obs)
##   Primary splits:
##       age      < 41.5     to the left,  improve=10.4347800, (0 missing)
##       charges  < 19621.16 to the left,  improve= 7.2347830, (0 missing)
##       bmi      < 24.56    to the left,  improve= 2.2501670, (0 missing)
##       children < 2.5      to the right, improve= 0.7732441, (0 missing)
##       region   < 2.5      to the right, improve= 0.5328218, (0 missing)
##   Surrogate splits:
##       charges < 19621.16 to the left,  agree=0.913, adj=0.75, (0 split)
##       bmi     < 24.56    to the left,  agree=0.739, adj=0.25, (0 split)
## 
## Node number 11: 30 observations
##   predicted class=1  expected loss=0  P(node) =0.02803738
##     class counts:     0    30
##    probabilities: 0.000 1.000 
## 
## Node number 12: 378 observations,    complexity param=0.04391892
##   predicted class=0  expected loss=0.3492063  P(node) =0.353271
##     class counts:   246   132
##    probabilities: 0.651 0.349 
##   left son=24 (187 obs) right son=25 (191 obs)
##   Primary splits:
##       age      < 40.5     to the left,  improve=27.8931800, (0 missing)
##       charges  < 7146.168 to the left,  improve=17.2151500, (0 missing)
##       children < 1.5      to the right, improve=13.6844100, (0 missing)
##       bmi      < 31.1925  to the left,  improve=12.0039700, (0 missing)
##       region   < 0.5      to the right, improve= 0.5812166, (0 missing)
##   Surrogate splits:
##       charges  < 6659.237 to the left,  agree=0.897, adj=0.791, (0 split)
##       bmi      < 31.3025  to the left,  agree=0.566, adj=0.123, (0 split)
##       sex      < 0.5      to the right, agree=0.534, adj=0.059, (0 split)
##       region   < 0.5      to the right, agree=0.526, adj=0.043, (0 split)
##       children < 3.5      to the right, agree=0.516, adj=0.021, (0 split)
## 
## Node number 13: 95 observations,    complexity param=0.005630631
##   predicted class=1  expected loss=0.06315789  P(node) =0.08878505
##     class counts:     6    89
##    probabilities: 0.063 0.937 
##   left son=26 (15 obs) right son=27 (80 obs)
##   Primary splits:
##       charges  < 21902.02 to the left,  improve=2.6004390, (0 missing)
##       bmi      < 29.765   to the left,  improve=2.2421050, (0 missing)
##       children < 3.5      to the right, improve=1.7740200, (0 missing)
##       age      < 35.5     to the left,  improve=0.8363513, (0 missing)
##       region   < 1.5      to the left,  improve=0.1132164, (0 missing)
##   Surrogate splits:
##       bmi < 29.765   to the left,  agree=0.905, adj=0.400, (0 split)
##       age < 21       to the left,  agree=0.863, adj=0.133, (0 split)
## 
## Node number 16: 157 observations,    complexity param=0.001126126
##   predicted class=0  expected loss=0.006369427  P(node) =0.146729
##     class counts:   156     1
##    probabilities: 0.994 0.006 
##   left son=32 (147 obs) right son=33 (10 obs)
##   Primary splits:
##       bmi      < 25.1275  to the left,  improve=0.18726110, (0 missing)
##       children < 1.5      to the left,  improve=0.03073941, (0 missing)
##       region   < 0.5      to the right, improve=0.02981434, (0 missing)
##       charges  < 9030.432 to the left,  improve=0.02362478, (0 missing)
##       age      < 44.5     to the left,  improve=0.02004803, (0 missing)
## 
## Node number 17: 27 observations,    complexity param=0.004504505
##   predicted class=0  expected loss=0.2222222  P(node) =0.02523364
##     class counts:    21     6
##    probabilities: 0.778 0.222 
##   left son=34 (19 obs) right son=35 (8 obs)
##   Primary splits:
##       children < 1.5      to the left,  improve=6.3333330, (0 missing)
##       charges  < 9016.941 to the left,  improve=2.4761900, (0 missing)
##       age      < 44.5     to the left,  improve=1.8333330, (0 missing)
##       bmi      < 25.3825  to the right, improve=1.3333330, (0 missing)
##       region   < 1.5      to the left,  improve=0.5333333, (0 missing)
##   Surrogate splits:
##       bmi     < 25.3825  to the right, agree=0.741, adj=0.125, (0 split)
##       charges < 12385.87 to the left,  agree=0.741, adj=0.125, (0 split)
## 
## Node number 20: 15 observations
##   predicted class=0  expected loss=0  P(node) =0.01401869
##     class counts:    15     0
##    probabilities: 1.000 0.000 
## 
## Node number 21: 8 observations
##   predicted class=1  expected loss=0  P(node) =0.007476636
##     class counts:     0     8
##    probabilities: 0.000 1.000 
## 
## Node number 24: 187 observations,    complexity param=0.01351351
##   predicted class=0  expected loss=0.1550802  P(node) =0.1747664
##     class counts:   158    29
##    probabilities: 0.845 0.155 
##   left son=48 (169 obs) right son=49 (18 obs)
##   Primary splits:
##       bmi      < 40.1375  to the left,  improve=10.42547000, (0 missing)
##       children < 1.5      to the right, improve= 8.28593800, (0 missing)
##       charges  < 4170.071 to the right, improve= 3.18811300, (0 missing)
##       age      < 27.5     to the right, improve= 0.66374980, (0 missing)
##       region   < 2.5      to the left,  improve= 0.04327544, (0 missing)
## 
## Node number 25: 191 observations,    complexity param=0.04391892
##   predicted class=1  expected loss=0.460733  P(node) =0.1785047
##     class counts:    88   103
##    probabilities: 0.461 0.539 
##   left son=50 (56 obs) right son=51 (135 obs)
##   Primary splits:
##       children < 2.5      to the right, improve=10.187190, (0 missing)
##       bmi      < 34.49    to the left,  improve= 7.423943, (0 missing)
##       charges  < 11881.66 to the right, improve= 2.994786, (0 missing)
##       age      < 53.5     to the right, improve= 1.934244, (0 missing)
##       region   < 1.5      to the left,  improve= 1.254331, (0 missing)
##   Surrogate splits:
##       age < 62.5     to the right, agree=0.723, adj=0.054, (0 split)
## 
## Node number 26: 15 observations,    complexity param=0.005630631
##   predicted class=1  expected loss=0.3333333  P(node) =0.01401869
##     class counts:     5    10
##    probabilities: 0.333 0.667 
##   left son=52 (5 obs) right son=53 (10 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=6.6666670, (0 missing)
##       charges  < 19826.58 to the right, improve=1.4880950, (0 missing)
##       bmi      < 29.315   to the right, improve=0.9523810, (0 missing)
##       age      < 35.5     to the left,  improve=0.5128205, (0 missing)
##       region   < 2.5      to the right, improve=0.1282051, (0 missing)
##   Surrogate splits:
##       bmi     < 28.535   to the right, agree=0.733, adj=0.2, (0 split)
##       charges < 19826.58 to the right, agree=0.733, adj=0.2, (0 split)
## 
## Node number 27: 80 observations,    complexity param=0.001126126
##   predicted class=1  expected loss=0.0125  P(node) =0.07476636
##     class counts:     1    79
##    probabilities: 0.013 0.988 
##   left son=54 (9 obs) right son=55 (71 obs)
##   Primary splits:
##       bmi      < 29.07    to the left,  improve=0.19722220, (0 missing)
##       charges  < 33545.31 to the left,  improve=0.14166670, (0 missing)
##       children < 2.5      to the right, improve=0.07023810, (0 missing)
##       age      < 42.5     to the left,  improve=0.03382353, (0 missing)
##       region   < 1.5      to the left,  improve=0.02905405, (0 missing)
##   Surrogate splits:
##       charges < 29526.8  to the left,  agree=0.975, adj=0.778, (0 split)
## 
## Node number 32: 147 observations
##   predicted class=0  expected loss=0  P(node) =0.1373832
##     class counts:   147     0
##    probabilities: 1.000 0.000 
## 
## Node number 33: 10 observations,    complexity param=0.001126126
##   predicted class=0  expected loss=0.1  P(node) =0.009345794
##     class counts:     9     1
##    probabilities: 0.900 0.100 
##   left son=66 (9 obs) right son=67 (1 obs)
##   Primary splits:
##       children < 1.5      to the left,  improve=1.8000000, (0 missing)
##       region   < 0.5      to the right, improve=0.4666667, (0 missing)
##       charges  < 8768.868 to the left,  improve=0.3000000, (0 missing)
##       age      < 43.5     to the left,  improve=0.2000000, (0 missing)
##       sex      < 0.5      to the right, improve=0.2000000, (0 missing)
## 
## Node number 34: 19 observations
##   predicted class=0  expected loss=0  P(node) =0.01775701
##     class counts:    19     0
##    probabilities: 1.000 0.000 
## 
## Node number 35: 8 observations,    complexity param=0.004504505
##   predicted class=1  expected loss=0.25  P(node) =0.007476636
##     class counts:     2     6
##    probabilities: 0.250 0.750 
##   left son=70 (2 obs) right son=71 (6 obs)
##   Primary splits:
##       age      < 38       to the left,  improve=3.0000000, (0 missing)
##       charges  < 7018.252 to the left,  improve=3.0000000, (0 missing)
##       bmi      < 25.7425  to the right, improve=1.6666670, (0 missing)
##       children < 2.5      to the left,  improve=0.6000000, (0 missing)
##       region   < 0.5      to the left,  improve=0.3333333, (0 missing)
##   Surrogate splits:
##       charges < 7018.252 to the left,  agree=1.000, adj=1.0, (0 split)
##       bmi     < 25.7425  to the right, agree=0.875, adj=0.5, (0 split)
## 
## Node number 48: 169 observations,    complexity param=0.01276276
##   predicted class=0  expected loss=0.1005917  P(node) =0.1579439
##     class counts:   152    17
##    probabilities: 0.899 0.101 
##   left son=96 (99 obs) right son=97 (70 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=4.8370250, (0 missing)
##       bmi      < 31.1925  to the left,  improve=4.0863750, (0 missing)
##       charges  < 4170.071 to the right, improve=2.2165900, (0 missing)
##       age      < 32.5     to the right, improve=0.8416554, (0 missing)
##       region   < 1.5      to the right, improve=0.2004161, (0 missing)
##   Surrogate splits:
##       charges < 3393.167 to the right, agree=0.692, adj=0.257, (0 split)
##       bmi     < 28.4525  to the right, agree=0.598, adj=0.029, (0 split)
##       age     < 18.5     to the right, agree=0.592, adj=0.014, (0 split)
## 
## Node number 49: 18 observations,    complexity param=0.006756757
##   predicted class=1  expected loss=0.3333333  P(node) =0.01682243
##     class counts:     6    12
##    probabilities: 0.333 0.667 
##   left son=98 (9 obs) right son=99 (9 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=4.0000000, (0 missing)
##       age      < 28.5     to the right, improve=1.5384620, (0 missing)
##       bmi      < 45.06    to the left,  improve=1.5384620, (0 missing)
##       charges  < 3979.797 to the right, improve=1.5384620, (0 missing)
##       region   < 0.5      to the right, improve=0.2352941, (0 missing)
##   Surrogate splits:
##       bmi     < 42.92    to the right, agree=0.778, adj=0.556, (0 split)
##       age     < 26.5     to the right, agree=0.722, adj=0.444, (0 split)
##       charges < 3551.876 to the right, agree=0.722, adj=0.444, (0 split)
##       region  < 1.5      to the right, agree=0.611, adj=0.222, (0 split)
##       sex     < 0.5      to the right, agree=0.556, adj=0.111, (0 split)
## 
## Node number 50: 56 observations,    complexity param=0.03378378
##   predicted class=0  expected loss=0.2857143  P(node) =0.05233645
##     class counts:    40    16
##    probabilities: 0.714 0.286 
##   left son=100 (41 obs) right son=101 (15 obs)
##   Primary splits:
##       bmi      < 29.9725  to the right, improve=20.9059200, (0 missing)
##       age      < 53.5     to the right, improve= 2.7501910, (0 missing)
##       children < 4.5      to the left,  improve= 2.1164020, (0 missing)
##       charges  < 11258.98 to the right, improve= 1.4404760, (0 missing)
##       region   < 0.5      to the right, improve= 0.3180124, (0 missing)
## 
## Node number 51: 135 observations,    complexity param=0.02702703
##   predicted class=1  expected loss=0.3555556  P(node) =0.1261682
##     class counts:    48    87
##    probabilities: 0.356 0.644 
##   left son=102 (78 obs) right son=103 (57 obs)
##   Primary splits:
##       bmi     < 33.5825  to the left,  improve=18.105530, (0 missing)
##       region  < 1.5      to the left,  improve= 2.115893, (0 missing)
##       charges < 7099.057 to the left,  improve= 1.866667, (0 missing)
##       age     < 61.5     to the left,  improve= 1.312821, (0 missing)
##       sex     < 0.5      to the left,  improve= 0.533928, (0 missing)
##   Surrogate splits:
##       charges < 14177.35 to the left,  agree=0.622, adj=0.105, (0 split)
##       age     < 60.5     to the left,  agree=0.600, adj=0.053, (0 split)
## 
## Node number 52: 5 observations
##   predicted class=0  expected loss=0  P(node) =0.004672897
##     class counts:     5     0
##    probabilities: 1.000 0.000 
## 
## Node number 53: 10 observations
##   predicted class=1  expected loss=0  P(node) =0.009345794
##     class counts:     0    10
##    probabilities: 0.000 1.000 
## 
## Node number 54: 9 observations,    complexity param=0.001126126
##   predicted class=1  expected loss=0.1111111  P(node) =0.008411215
##     class counts:     1     8
##    probabilities: 0.111 0.889 
##   left son=108 (1 obs) right son=109 (8 obs)
##   Primary splits:
##       charges  < 30828.06 to the right, improve=1.7777780, (0 missing)
##       age      < 44.5     to the left,  improve=0.7777778, (0 missing)
##       bmi      < 28.2625  to the right, improve=0.7777778, (0 missing)
##       children < 2.5      to the right, improve=0.7777778, (0 missing)
##       sex      < 0.5      to the right, improve=0.1777778, (0 missing)
## 
## Node number 55: 71 observations
##   predicted class=1  expected loss=0  P(node) =0.06635514
##     class counts:     0    71
##    probabilities: 0.000 1.000 
## 
## Node number 66: 9 observations
##   predicted class=0  expected loss=0  P(node) =0.008411215
##     class counts:     9     0
##    probabilities: 1.000 0.000 
## 
## Node number 67: 1 observations
##   predicted class=1  expected loss=0  P(node) =0.0009345794
##     class counts:     0     1
##    probabilities: 0.000 1.000 
## 
## Node number 70: 2 observations
##   predicted class=0  expected loss=0  P(node) =0.001869159
##     class counts:     2     0
##    probabilities: 1.000 0.000 
## 
## Node number 71: 6 observations
##   predicted class=1  expected loss=0  P(node) =0.005607477
##     class counts:     0     6
##    probabilities: 0.000 1.000 
## 
## Node number 96: 99 observations
##   predicted class=0  expected loss=0  P(node) =0.09252336
##     class counts:    99     0
##    probabilities: 1.000 0.000 
## 
## Node number 97: 70 observations,    complexity param=0.01276276
##   predicted class=0  expected loss=0.2428571  P(node) =0.06542056
##     class counts:    53    17
##    probabilities: 0.757 0.243 
##   left son=194 (41 obs) right son=195 (29 obs)
##   Primary splits:
##       bmi     < 31.1275  to the left,  improve=11.6738900, (0 missing)
##       age     < 32.5     to the right, improve= 1.4814940, (0 missing)
##       charges < 4194.078 to the right, improve= 1.2000940, (0 missing)
##       region  < 1.5      to the right, improve= 1.0296530, (0 missing)
##       sex     < 0.5      to the right, improve= 0.3474323, (0 missing)
##   Surrogate splits:
##       charges < 4194.078 to the right, agree=0.671, adj=0.207, (0 split)
##       age     < 20.5     to the right, agree=0.629, adj=0.103, (0 split)
## 
## Node number 98: 9 observations,    complexity param=0.006756757
##   predicted class=0  expected loss=0.3333333  P(node) =0.008411215
##     class counts:     6     3
##    probabilities: 0.667 0.333 
##   left son=196 (6 obs) right son=197 (3 obs)
##   Primary splits:
##       bmi      < 45.06    to the left,  improve=4.00, (0 missing)
##       age      < 28.5     to the right, improve=1.00, (0 missing)
##       sex      < 0.5      to the left,  improve=1.00, (0 missing)
##       charges  < 3979.797 to the right, improve=1.00, (0 missing)
##       children < 4        to the right, improve=0.25, (0 missing)
##   Surrogate splits:
##       charges < 4720.013 to the right, agree=0.778, adj=0.333, (0 split)
## 
## Node number 99: 9 observations
##   predicted class=1  expected loss=0  P(node) =0.008411215
##     class counts:     0     9
##    probabilities: 0.000 1.000 
## 
## Node number 100: 41 observations,    complexity param=0.002252252
##   predicted class=0  expected loss=0.02439024  P(node) =0.03831776
##     class counts:    40     1
##    probabilities: 0.976 0.024 
##   left son=200 (40 obs) right son=201 (1 obs)
##   Primary splits:
##       bmi      < 45.725   to the left,  improve=1.95122000, (0 missing)
##       children < 4.5      to the left,  improve=1.95122000, (0 missing)
##       charges  < 12543.91 to the left,  improve=0.06886657, (0 missing)
##       age      < 52.5     to the right, improve=0.05648267, (0 missing)
##       region   < 1.5      to the left,  improve=0.03817603, (0 missing)
## 
## Node number 101: 15 observations
##   predicted class=1  expected loss=0  P(node) =0.01401869
##     class counts:     0    15
##    probabilities: 0.000 1.000 
## 
## Node number 102: 78 observations,    complexity param=0.02027027
##   predicted class=0  expected loss=0.4230769  P(node) =0.0728972
##     class counts:    45    33
##    probabilities: 0.577 0.423 
##   left son=204 (31 obs) right son=205 (47 obs)
##   Primary splits:
##       bmi     < 29.34    to the left,  improve=2.8017000, (0 missing)
##       charges < 7347.962 to the left,  improve=2.3945700, (0 missing)
##       age     < 43.5     to the left,  improve=0.9086691, (0 missing)
##       region  < 0.5      to the right, improve=0.6350258, (0 missing)
##       sex     < 0.5      to the left,  improve=0.2263796, (0 missing)
##   Surrogate splits:
##       charges < 7205.138 to the left,  agree=0.654, adj=0.129, (0 split)
##       age     < 42.5     to the left,  agree=0.615, adj=0.032, (0 split)
## 
## Node number 103: 57 observations,    complexity param=0.002252252
##   predicted class=1  expected loss=0.05263158  P(node) =0.05327103
##     class counts:     3    54
##    probabilities: 0.053 0.947 
##   left son=206 (8 obs) right son=207 (49 obs)
##   Primary splits:
##       bmi      < 39.9875  to the right, improve=0.7250269, (0 missing)
##       children < 1.5      to the right, improve=0.4668192, (0 missing)
##       charges  < 8651.546 to the right, improve=0.1706970, (0 missing)
##       age      < 45.5     to the right, improve=0.1342105, (0 missing)
##       region   < 1.5      to the left,  improve=0.1207185, (0 missing)
## 
## Node number 108: 1 observations
##   predicted class=0  expected loss=0  P(node) =0.0009345794
##     class counts:     1     0
##    probabilities: 1.000 0.000 
## 
## Node number 109: 8 observations
##   predicted class=1  expected loss=0  P(node) =0.007476636
##     class counts:     0     8
##    probabilities: 0.000 1.000 
## 
## Node number 194: 41 observations
##   predicted class=0  expected loss=0  P(node) =0.03831776
##     class counts:    41     0
##    probabilities: 1.000 0.000 
## 
## Node number 195: 29 observations,    complexity param=0.01276276
##   predicted class=1  expected loss=0.4137931  P(node) =0.0271028
##     class counts:    12    17
##    probabilities: 0.414 0.586 
##   left son=390 (12 obs) right son=391 (17 obs)
##   Primary splits:
##       bmi     < 35       to the right, improve=14.0689700, (0 missing)
##       region  < 1.5      to the right, improve= 2.5003380, (0 missing)
##       age     < 18.5     to the left,  improve= 1.4763730, (0 missing)
##       charges < 18933.33 to the right, improve= 1.4763730, (0 missing)
##       sex     < 0.5      to the right, improve= 0.8880131, (0 missing)
##   Surrogate splits:
##       region  < 1.5      to the right, agree=0.690, adj=0.250, (0 split)
##       age     < 18.5     to the left,  agree=0.655, adj=0.167, (0 split)
##       charges < 18933.33 to the right, agree=0.655, adj=0.167, (0 split)
##       sex     < 0.5      to the right, agree=0.621, adj=0.083, (0 split)
## 
## Node number 196: 6 observations
##   predicted class=0  expected loss=0  P(node) =0.005607477
##     class counts:     6     0
##    probabilities: 1.000 0.000 
## 
## Node number 197: 3 observations
##   predicted class=1  expected loss=0  P(node) =0.002803738
##     class counts:     0     3
##    probabilities: 0.000 1.000 
## 
## Node number 200: 40 observations
##   predicted class=0  expected loss=0  P(node) =0.03738318
##     class counts:    40     0
##    probabilities: 1.000 0.000 
## 
## Node number 201: 1 observations
##   predicted class=1  expected loss=0  P(node) =0.0009345794
##     class counts:     0     1
##    probabilities: 0.000 1.000 
## 
## Node number 204: 31 observations,    complexity param=0.01801802
##   predicted class=0  expected loss=0.2580645  P(node) =0.02897196
##     class counts:    23     8
##    probabilities: 0.742 0.258 
##   left son=408 (23 obs) right son=409 (8 obs)
##   Primary splits:
##       children < 1.5      to the left,  improve=11.8709700, (0 missing)
##       bmi      < 26.8875  to the right, improve= 2.4843010, (0 missing)
##       charges  < 7676.924 to the left,  improve= 1.2043010, (0 missing)
##       age      < 47.5     to the right, improve= 0.9043011, (0 missing)
##       region   < 2.5      to the left,  improve= 0.2218449, (0 missing)
##   Surrogate splits:
##       bmi < 26.2675  to the right, agree=0.806, adj=0.25, (0 split)
## 
## Node number 205: 47 observations,    complexity param=0.02027027
##   predicted class=1  expected loss=0.4680851  P(node) =0.04392523
##     class counts:    22    25
##    probabilities: 0.468 0.532 
##   left son=410 (23 obs) right son=411 (24 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=11.5455600, (0 missing)
##       bmi      < 32.865   to the right, improve= 1.2328270, (0 missing)
##       charges  < 11881.66 to the right, improve= 0.8133462, (0 missing)
##       region   < 1.5      to the left,  improve= 0.5764165, (0 missing)
##       age      < 43.5     to the left,  improve= 0.4747681, (0 missing)
##   Surrogate splits:
##       charges < 11880.23 to the right, agree=0.681, adj=0.348, (0 split)
##       age     < 45.5     to the left,  agree=0.596, adj=0.174, (0 split)
##       bmi     < 29.7825  to the left,  agree=0.574, adj=0.130, (0 split)
##       region  < 0.5      to the right, agree=0.574, adj=0.130, (0 split)
## 
## Node number 206: 8 observations,    complexity param=0.002252252
##   predicted class=1  expected loss=0.25  P(node) =0.007476636
##     class counts:     2     6
##    probabilities: 0.250 0.750 
##   left son=412 (2 obs) right son=413 (6 obs)
##   Primary splits:
##       bmi      < 43.19    to the left,  improve=3.0000000, (0 missing)
##       children < 1.5      to the right, improve=1.6666670, (0 missing)
##       age      < 49.5     to the left,  improve=0.6000000, (0 missing)
##       region   < 1.5      to the left,  improve=0.3333333, (0 missing)
##       charges  < 8651.546 to the right, improve=0.3333333, (0 missing)
##   Surrogate splits:
##       children < 1.5      to the right, agree=0.875, adj=0.5, (0 split)
## 
## Node number 207: 49 observations,    complexity param=0.001126126
##   predicted class=1  expected loss=0.02040816  P(node) =0.04579439
##     class counts:     1    48
##    probabilities: 0.020 0.980 
##   left son=414 (5 obs) right son=415 (44 obs)
##   Primary splits:
##       bmi      < 34.3075  to the left,  improve=0.35918370, (0 missing)
##       age      < 56.5     to the right, improve=0.18140590, (0 missing)
##       region   < 0.5      to the left,  improve=0.18140590, (0 missing)
##       charges  < 12935.13 to the right, improve=0.11302980, (0 missing)
##       children < 1.5      to the right, improve=0.05918367, (0 missing)
## 
## Node number 390: 12 observations
##   predicted class=0  expected loss=0  P(node) =0.01121495
##     class counts:    12     0
##    probabilities: 1.000 0.000 
## 
## Node number 391: 17 observations
##   predicted class=1  expected loss=0  P(node) =0.01588785
##     class counts:     0    17
##    probabilities: 0.000 1.000 
## 
## Node number 408: 23 observations
##   predicted class=0  expected loss=0  P(node) =0.02149533
##     class counts:    23     0
##    probabilities: 1.000 0.000 
## 
## Node number 409: 8 observations
##   predicted class=1  expected loss=0  P(node) =0.007476636
##     class counts:     0     8
##    probabilities: 0.000 1.000 
## 
## Node number 410: 23 observations,    complexity param=0.009009009
##   predicted class=0  expected loss=0.173913  P(node) =0.02149533
##     class counts:    19     4
##    probabilities: 0.826 0.174 
##   left son=820 (19 obs) right son=821 (4 obs)
##   Primary splits:
##       bmi     < 29.9375  to the right, improve=6.60869600, (0 missing)
##       charges < 30647.57 to the left,  improve=1.42687700, (0 missing)
##       region  < 1.5      to the left,  improve=0.56254180, (0 missing)
##       age     < 44.5     to the right, improve=0.25155280, (0 missing)
##       sex     < 0.5      to the left,  improve=0.02408027, (0 missing)
## 
## Node number 411: 24 observations,    complexity param=0.006756757
##   predicted class=1  expected loss=0.125  P(node) =0.02242991
##     class counts:     3    21
##    probabilities: 0.125 0.875 
##   left son=822 (3 obs) right son=823 (21 obs)
##   Primary splits:
##       bmi     < 29.965   to the left,  improve=5.2500000, (0 missing)
##       charges < 9289.083 to the left,  improve=0.7500000, (0 missing)
##       age     < 49.5     to the left,  improve=0.5357143, (0 missing)
##       sex     < 0.5      to the left,  improve=0.0472028, (0 missing)
##       region  < 1.5      to the left,  improve=0.0472028, (0 missing)
## 
## Node number 412: 2 observations
##   predicted class=0  expected loss=0  P(node) =0.001869159
##     class counts:     2     0
##    probabilities: 1.000 0.000 
## 
## Node number 413: 6 observations
##   predicted class=1  expected loss=0  P(node) =0.005607477
##     class counts:     0     6
##    probabilities: 0.000 1.000 
## 
## Node number 414: 5 observations,    complexity param=0.001126126
##   predicted class=1  expected loss=0.2  P(node) =0.004672897
##     class counts:     1     4
##    probabilities: 0.200 0.800 
##   left son=828 (1 obs) right son=829 (4 obs)
##   Primary splits:
##       age      < 55.5     to the right, improve=1.6, (0 missing)
##       bmi      < 34.2525  to the right, improve=1.6, (0 missing)
##       children < 1.5      to the right, improve=1.6, (0 missing)
##       charges  < 12024.66 to the right, improve=1.6, (0 missing)
##       sex      < 0.5      to the left,  improve=0.6, (0 missing)
## 
## Node number 415: 44 observations
##   predicted class=1  expected loss=0  P(node) =0.0411215
##     class counts:     0    44
##    probabilities: 0.000 1.000 
## 
## Node number 820: 19 observations
##   predicted class=0  expected loss=0  P(node) =0.01775701
##     class counts:    19     0
##    probabilities: 1.000 0.000 
## 
## Node number 821: 4 observations
##   predicted class=1  expected loss=0  P(node) =0.003738318
##     class counts:     0     4
##    probabilities: 0.000 1.000 
## 
## Node number 822: 3 observations
##   predicted class=0  expected loss=0  P(node) =0.002803738
##     class counts:     3     0
##    probabilities: 1.000 0.000 
## 
## Node number 823: 21 observations
##   predicted class=1  expected loss=0  P(node) =0.01962617
##     class counts:     0    21
##    probabilities: 0.000 1.000 
## 
## Node number 828: 1 observations
##   predicted class=0  expected loss=0  P(node) =0.0009345794
##     class counts:     1     0
##    probabilities: 1.000 0.000 
## 
## Node number 829: 4 observations
##   predicted class=1  expected loss=0  P(node) =0.003738318
##     class counts:     0     4
##    probabilities: 0.000 1.000

Decision Tree Summary

Tree Structure

The tree is split into nodes, with each node representing a decision point based on a particular variable. The tree starts with the root node (Node number 1) and branches down to subsequent nodes.

Complexity Parameters

The complexity parameters (CP) help control the size of the tree to prevent overfitting. Smaller CP values result in larger trees.

Variable Importance

The variable importance section indicates the importance of each predictor variable in making predictions. In this case, it looks like “bmi” (body mass index) is the most important variable, followed by “children,” “charges,” “smoker,” “age,” and “region.”

Nodes

Each node in the tree is associated with specific information: - Node Number: Identifies the node. - Predicted Class: The predicted class (0 or 1) at that node. - Expected Loss: The expected misclassification rate at that node. - P(node): The proportion of observations in that node.

Splitting Criteria

The “Primary Splits” indicate the conditions for splitting nodes. For example, “bmi < 25.9825” means that if a certain observation has a BMI less than 25.9825, it follows the left branch; otherwise, it follows the right branch.

Surrogate Splits

Surrogate splits are alternative rules used when primary splits cannot be applied. They provide a backup in case the primary split is not informative.

Example Interpretation

For Node 1: - Predicted class: 1 (insurance claim) - Expected loss: 0.415 - The node is split based on various conditions, with “bmi” being the most significant.

For Node 2: - Predicted class: 0 (no insurance claim) - Expected loss: 0.213 - The node is split based on “smoker,” “charges,” “bmi,” “age,” and “region.”

For Node 3: - Predicted class: 1 (insurance claim) - Expected loss: 0.305 - The node is split based on “children,” “smoker,” “charges,” “age,” and “bmi.”

The root node error is the percent of incorrectly classified cases at the first (root) splitting node. That is,

\[ \text{Root Node Error} = 1 - \max_k \hat{p}_k = \frac{\text{No. of observations in the minority class of the training dataset}}{\text{Size of the training dataset}} \] This is the root node error reported at the first step of the tree construction (here \(444/1070 \approx 0.415\)).

# Root node error = share of training cases NOT in the majority class.
# BUG FIX: the original summed the 1's (626/1070 = 0.585), but rpart
# predicts the majority class (1) at the root, so the misclassified cases
# are the 444 zeros: 444/1070 = 0.41495, matching the printcp() output.
(rootnode_err <- sum(insurance.data.train$insuranceclaim == 0) / nrow(insurance.data.train))
## [1] 0.4149533
printcp(fit.allp) 
## 
## Classification tree:
## rpart(formula = insuranceclaim ~ ., data = insurance.data.train, 
##     method = "class", control = rpart.control(minsplit = 1, cp = 0.001))
## 
## Variables actually used in tree construction:
## [1] age      bmi      charges  children smoker  
## 
## Root node error: 444/1070 = 0.41495
## 
## n= 1070 
## 
##           CP nsplit rel error   xerror     xstd
## 1  0.3153153      0 1.0000000 1.000000 0.036300
## 2  0.1283784      1 0.6846847 0.718468 0.033701
## 3  0.0518018      3 0.4279279 0.436937 0.028384
## 4  0.0439189      4 0.3761261 0.427928 0.028154
## 5  0.0337838      6 0.2882883 0.324324 0.025143
## 6  0.0270270      7 0.2545045 0.299550 0.024306
## 7  0.0202703      8 0.2274775 0.256757 0.022730
## 8  0.0180180     10 0.1869369 0.236486 0.021917
## 9  0.0168919     11 0.1689189 0.195946 0.020135
## 10 0.0157658     13 0.1351351 0.195946 0.020135
## 11 0.0135135     14 0.1193694 0.171171 0.018925
## 12 0.0127628     15 0.1058559 0.168919 0.018809
## 13 0.0090090     18 0.0675676 0.105856 0.015098
## 14 0.0067568     19 0.0585586 0.103604 0.014944
## 15 0.0056306     22 0.0382883 0.083333 0.013461
## 16 0.0045045     24 0.0270270 0.083333 0.013461
## 17 0.0022523     27 0.0135135 0.056306 0.011129
## 18 0.0011261     30 0.0067568 0.054054 0.010909
## 19 0.0010000     36 0.0000000 0.051802 0.010685

The root node error is the misclassification rate before any split is made. In this case it is 41.5%, meaning 41.5% of training instances would be misclassified if every observation were assigned the majority class. CP is a tuning parameter controlling the trade-off between tree complexity and goodness of fit. The initial CP is 0.3153153; it decreases as the tree grows and fits the data more closely, and the relative error (rel error) and cross-validated error (xerror) decrease along with it. The number of splits ranges from 0 to 36.

# Pull the nsplit column out of the CP table once, then report its extremes:
# the deepest tree considered and the no-split (root-only) model.
nsplit_vals <- fit.allp$cptable[, "nsplit"]
max(nsplit_vals)
## [1] 36
min(nsplit_vals)
## [1] 0

From the complexity table (cptable) associated with the fitted tree model, the most complex version of the tree involves 36 splits (decision points). A minimum value of 0 corresponds to a model with no splits — essentially a single-node tree consisting of just the root.

Following is the plot of cross-validated relative error (X-val Relative Error) versus cp and tree size, where the curve decreases from 1.0 toward its minimum, showing that the K-fold cross-validated error falls as the tree grows.

# Plot cross-validated relative error against cp / tree size.
plotcp(fit.allp)

# Find the CP-table row with the smallest cross-validated error once,
# then read off its cp value and the corresponding minimal xerror.
best_row <- which.min(fit.allp$cptable[, "xerror"])
(cp <- fit.allp$cptable[best_row, "CP"])
## [1] 0.001
(xerr <- fit.allp$cptable[best_row, "xerror"])
## [1] 0.0518018

The values suggest that the tree model with a complexity parameter of 0.001 has the minimum cross-validated error of approximately 0.0518. This model strikes a balance between complexity and accuracy, making it a reasonable choice based on cross-validated performance.

The following plot shows the split at each level of the tree, with the predicted response (0/1), the node error, and the percentage of the data falling in each node. The splitting condition is specified at each split.

rpart.plot(fit.allp, extra = "auto", main = "Fitted tree using CART for the Insurance data")

# Score the held-out test set and cross-tabulate predictions vs. actuals.
test_df <- data.frame(
  actual = insurance.data.test$insuranceclaim,
  pred = predict(fit.allp, newdata = insurance.data.test, type = "class")
)
(conf_matrix_base <- table(test_df$pred, test_df$actual)) #confusion matrix
##    
##       0   1
##   0 108   3
##   1   3 154

The model correctly identified 154 cases where insurance claims were made (True Positives) and accurately predicted 108 instances of no claims (True Negatives). However, it made a small number of errors, with 3 instances of falsely predicting a claim (False Positives) and 3 instances of failing to predict an actual claim (False Negatives).

# Recall on the positive class (claims correctly flagged).
sensitivity(conf_matrix_base)
## [1] 0.972973
# Proportion of no-claim cases correctly identified.
specificity(conf_matrix_base)
## [1] 0.9808917
# BUG FIX: the original closed the parenthesis right after the sum of the
# two off-diagonal counts, so `mis.rate` stored 6 (the raw error count)
# while only the printed value was divided by the total. Parenthesise the
# numerator so the variable holds the misclassification rate itself.
(mis.rate <- (conf_matrix_base[1, 2] +
    conf_matrix_base[2, 1]) / sum(conf_matrix_base))
## [1] 0.02238806

The sensitivity of the model, also known as recall, is approximately 97.30%. This indicates the proportion of actual insurance claims correctly identified by the model.

The specificity, measuring the proportion of no insurance claims correctly identified, is approximately 98.09%.

The overall mis classification rate, representing the proportion of incorrect predictions out of the total predictions, is approximately 2.24%.

The code below refits the model with the cp parameter set to 0.0001; the results look similar to those obtained with cp = 0.001.

#Hyper Parameter Tuning 

# Refit the tree with a much smaller cp (0.0001) to check whether allowing
# a more complex tree changes out-of-sample performance.
fit.allf <- rpart(insuranceclaim ~ ., method = "class", data = insurance.data.train,
                  control = rpart.control(cp = 0.0001))

# BUG FIX: the original plotted and evaluated `fit.allp` (the cp = 0.001
# model) below, so the tuned model `fit.allf` was never actually inspected.
# NOTE(review): the printed results below were generated from fit.allp;
# re-knit the document to refresh them for fit.allf.
plotcp(fit.allf)

# Test-set confusion matrix for the tuned model.
test_df <- data.frame(actual = insurance.data.test$insuranceclaim, pred = NA)
test_df$pred <- predict(fit.allf, newdata = insurance.data.test, type = "class")
(conf_matrix_base <- table(test_df$pred, test_df$actual)) #confusion matrix
##    
##       0   1
##   0 108   3
##   1   3 154
sensitivity(conf_matrix_base)
## [1] 0.972973
specificity(conf_matrix_base)
## [1] 0.9808917
# BUG FIX: parenthesise the numerator so mis.rate stores the rate rather
# than the raw error count (the original assigned 6 and only printed the rate).
(mis.rate <- (conf_matrix_base[1, 2] +
    conf_matrix_base[2, 1]) / sum(conf_matrix_base))
## [1] 0.02238806

# Training-set confusion matrix: a perfect fit here indicates the deep tree
# memorises the training data (an overfitting warning sign).
test_df <- data.frame(actual = insurance.data.train$insuranceclaim, pred = NA)
test_df$pred <- predict(fit.allf, newdata = insurance.data.train, type = "class")
(tab <- table(test_df$pred, test_df$actual)) #confusion matrix
##    
##       0   1
##   0 444   0
##   1   0 626
sum(diag(tab)) / sum(tab)
## [1] 1
sensitivity(tab)
## [1] 1
specificity(tab)
## [1] 1

The code below refits the model with the cp parameter set to 0.1; the results look similar to those obtained with cp = 0.001 and 0.0001.

# Refit the tree with a large cp (0.1), which forces a much shallower tree.
fit.allf <- rpart(insuranceclaim ~ ., method = "class", data = insurance.data.train,
                  control = rpart.control(cp = 0.1))

# BUG FIX: the original plotted and evaluated `fit.allp` below, so the
# cp = 0.1 model was never actually inspected.
# NOTE(review): the printed results below were generated from fit.allp;
# re-knit to refresh them for fit.allf (a cp = 0.1 tree is much shallower
# and may well perform differently).
plotcp(fit.allf)

test_df <- data.frame(actual = insurance.data.test$insuranceclaim, pred = NA)
test_df$pred <- predict(fit.allf, newdata = insurance.data.test, type = "class")
(conf_matrix_base <- table(test_df$pred, test_df$actual)) #confusion matrix
##    
##       0   1
##   0 108   3
##   1   3 154
sensitivity(conf_matrix_base)
## [1] 0.972973
specificity(conf_matrix_base)
## [1] 0.9808917
# BUG FIX: parenthesise the numerator so mis.rate stores the misclassification
# rate rather than the raw error count.
(mis.rate <- (conf_matrix_base[1, 2] +
    conf_matrix_base[2, 1]) / sum(conf_matrix_base))
## [1] 0.02238806

The function prune() can be used to select a subtree of the tree obtained with rpart() if we think (by looking at the xerror estimates) that we would fit the data better by pruning.

#Prune the tree
# Select the cp with the lowest cross-validated error, prune the full tree
# back to that level, and plot the resulting subtree.
best_cp <- fit.allp$cptable[which.min(fit.allp$cptable[, "xerror"]), "CP"]
pfit.allp <- prune(fit.allp, cp = best_cp)
rpart.plot(pfit.allp, extra = "auto", main = "Pruned Decision Tree")

summary(pfit.allp)
## Call:
## rpart(formula = insuranceclaim ~ ., data = insurance.data.train, 
##     method = "class", control = rpart.control(minsplit = 1, cp = 0.001))
##   n= 1070 
## 
##             CP nsplit   rel error     xerror       xstd
## 1  0.315315315      0 1.000000000 1.00000000 0.03629976
## 2  0.128378378      1 0.684684685 0.71846847 0.03370082
## 3  0.051801802      3 0.427927928 0.43693694 0.02838429
## 4  0.043918919      4 0.376126126 0.42792793 0.02815421
## 5  0.033783784      6 0.288288288 0.32432432 0.02514270
## 6  0.027027027      7 0.254504505 0.29954955 0.02430641
## 7  0.020270270      8 0.227477477 0.25675676 0.02273037
## 8  0.018018018     10 0.186936937 0.23648649 0.02191712
## 9  0.016891892     11 0.168918919 0.19594595 0.02013546
## 10 0.015765766     13 0.135135135 0.19594595 0.02013546
## 11 0.013513514     14 0.119369369 0.17117117 0.01892453
## 12 0.012762763     15 0.105855856 0.16891892 0.01880907
## 13 0.009009009     18 0.067567568 0.10585586 0.01509774
## 14 0.006756757     19 0.058558559 0.10360360 0.01494356
## 15 0.005630631     22 0.038288288 0.08333333 0.01346096
## 16 0.004504505     24 0.027027027 0.08333333 0.01346096
## 17 0.002252252     27 0.013513514 0.05630631 0.01112893
## 18 0.001126126     30 0.006756757 0.05405405 0.01090929
## 19 0.001000000     36 0.000000000 0.05180180 0.01068470
## 
## Variable importance
##      bmi children  charges   smoker      age   region 
##       31       26       20       12       10        1 
## 
## Node number 1: 1070 observations,    complexity param=0.3153153
##   predicted class=1  expected loss=0.4149533  P(node) =1
##     class counts:   444   626
##    probabilities: 0.415 0.585 
##   left son=2 (244 obs) right son=3 (826 obs)
##   Primary splits:
##       bmi      < 25.9825  to the left,  improve=87.44814, (0 missing)
##       children < 0.5      to the right, improve=82.55445, (0 missing)
##       smoker   < 0.5      to the left,  improve=57.01775, (0 missing)
##       charges  < 33047.5  to the left,  improve=45.12917, (0 missing)
##       age      < 41.5     to the left,  improve=16.60421, (0 missing)
## 
## Node number 2: 244 observations,    complexity param=0.0518018
##   predicted class=0  expected loss=0.2131148  P(node) =0.2280374
##     class counts:   192    52
##    probabilities: 0.787 0.213 
##   left son=4 (191 obs) right son=5 (53 obs)
##   Primary splits:
##       smoker  < 0.5      to the left,  improve=34.378990, (0 missing)
##       charges < 14511.86 to the left,  improve=26.570560, (0 missing)
##       bmi     < 17.575   to the right, improve= 7.247083, (0 missing)
##       age     < 63.5     to the left,  improve= 2.344399, (0 missing)
##       region  < 2.5      to the right, improve= 0.787562, (0 missing)
##   Surrogate splits:
##       charges < 14511.86 to the left,  agree=0.939, adj=0.717, (0 split)
##       bmi     < 25.845   to the left,  agree=0.787, adj=0.019, (0 split)
## 
## Node number 3: 826 observations,    complexity param=0.1283784
##   predicted class=1  expected loss=0.3050847  P(node) =0.7719626
##     class counts:   252   574
##    probabilities: 0.305 0.695 
##   left son=6 (473 obs) right son=7 (353 obs)
##   Primary splits:
##       children < 0.5      to the right, improve=114.753100, (0 missing)
##       smoker   < 0.5      to the left,  improve= 30.605070, (0 missing)
##       charges  < 33047.5  to the left,  improve= 24.727040, (0 missing)
##       age      < 41.5     to the left,  improve= 13.870640, (0 missing)
##       bmi      < 31.01    to the left,  improve=  9.525539, (0 missing)
##   Surrogate splits:
##       charges < 3220.372 to the right, agree=0.662, adj=0.210, (0 split)
##       age     < 25.5     to the right, agree=0.646, adj=0.173, (0 split)
##       bmi     < 26.6475  to the right, agree=0.579, adj=0.014, (0 split)
## 
## Node number 4: 191 observations,    complexity param=0.01576577
##   predicted class=0  expected loss=0.07329843  P(node) =0.1785047
##     class counts:   177    14
##    probabilities: 0.927 0.073 
##   left son=8 (184 obs) right son=9 (7 obs)
##   Primary splits:
##       bmi      < 17.575   to the right, improve=12.4802500, (0 missing)
##       charges  < 30225.63 to the left,  improve= 1.7265910, (0 missing)
##       children < 1.5      to the left,  improve= 0.8043285, (0 missing)
##       age      < 63.5     to the left,  improve= 0.7360038, (0 missing)
##       region   < 0.5      to the right, improve= 0.1247432, (0 missing)
## 
## Node number 5: 53 observations,    complexity param=0.01689189
##   predicted class=1  expected loss=0.2830189  P(node) =0.04953271
##     class counts:    15    38
##    probabilities: 0.283 0.717 
##   left son=10 (23 obs) right son=11 (30 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=11.074650, (0 missing)
##       age      < 41       to the left,  improve= 6.509434, (0 missing)
##       charges  < 19479.9  to the left,  improve= 4.805730, (0 missing)
##       bmi      < 19.1975  to the left,  improve= 4.448209, (0 missing)
##       region   < 2.5      to the right, improve= 1.610444, (0 missing)
##   Surrogate splits:
##       bmi     < 24.265   to the right, agree=0.660, adj=0.217, (0 split)
##       region  < 2.5      to the right, agree=0.623, adj=0.130, (0 split)
##       charges < 16717.01 to the right, agree=0.604, adj=0.087, (0 split)
##       age     < 29.5     to the right, agree=0.585, adj=0.043, (0 split)
## 
## Node number 6: 473 observations,    complexity param=0.1283784
##   predicted class=0  expected loss=0.4672304  P(node) =0.4420561
##     class counts:   252   221
##    probabilities: 0.533 0.467 
##   left son=12 (378 obs) right son=13 (95 obs)
##   Primary splits:
##       smoker   < 0.5      to the left,  improve=52.43251, (0 missing)
##       charges  < 30124.26 to the left,  improve=45.66116, (0 missing)
##       age      < 40.5     to the left,  improve=26.26169, (0 missing)
##       bmi      < 31.1925  to the left,  improve=15.70518, (0 missing)
##       children < 2.5      to the right, improve=14.09699, (0 missing)
##   Surrogate splits:
##       charges < 30124.26 to the left,  agree=0.941, adj=0.705, (0 split)
##       bmi     < 26.1525  to the right, agree=0.801, adj=0.011, (0 split)
## 
## Node number 7: 353 observations
##   predicted class=1  expected loss=0  P(node) =0.3299065
##     class counts:     0   353
##    probabilities: 0.000 1.000 
## 
## Node number 8: 184 observations,    complexity param=0.004504505
##   predicted class=0  expected loss=0.03804348  P(node) =0.1719626
##     class counts:   177     7
##    probabilities: 0.962 0.038 
##   left son=16 (157 obs) right son=17 (27 obs)
##   Primary splits:
##       bmi      < 25.3325  to the left,  improve=2.14679700, (0 missing)
##       charges  < 30225.63 to the left,  improve=1.86083400, (0 missing)
##       children < 1.5      to the left,  improve=1.28220600, (0 missing)
##       age      < 63.5     to the left,  improve=0.86299570, (0 missing)
##       region   < 2.5      to the left,  improve=0.08394667, (0 missing)
##   Surrogate splits:
##       charges < 28149.52 to the left,  agree=0.859, adj=0.037, (0 split)
## 
## Node number 9: 7 observations
##   predicted class=1  expected loss=0  P(node) =0.006542056
##     class counts:     0     7
##    probabilities: 0.000 1.000 
## 
## Node number 10: 23 observations,    complexity param=0.01689189
##   predicted class=0  expected loss=0.3478261  P(node) =0.02149533
##     class counts:    15     8
##    probabilities: 0.652 0.348 
##   left son=20 (15 obs) right son=21 (8 obs)
##   Primary splits:
##       age      < 41.5     to the left,  improve=10.4347800, (0 missing)
##       charges  < 19621.16 to the left,  improve= 7.2347830, (0 missing)
##       bmi      < 24.56    to the left,  improve= 2.2501670, (0 missing)
##       children < 2.5      to the right, improve= 0.7732441, (0 missing)
##       region   < 2.5      to the right, improve= 0.5328218, (0 missing)
##   Surrogate splits:
##       charges < 19621.16 to the left,  agree=0.913, adj=0.75, (0 split)
##       bmi     < 24.56    to the left,  agree=0.739, adj=0.25, (0 split)
## 
## Node number 11: 30 observations
##   predicted class=1  expected loss=0  P(node) =0.02803738
##     class counts:     0    30
##    probabilities: 0.000 1.000 
## 
## Node number 12: 378 observations,    complexity param=0.04391892
##   predicted class=0  expected loss=0.3492063  P(node) =0.353271
##     class counts:   246   132
##    probabilities: 0.651 0.349 
##   left son=24 (187 obs) right son=25 (191 obs)
##   Primary splits:
##       age      < 40.5     to the left,  improve=27.8931800, (0 missing)
##       charges  < 7146.168 to the left,  improve=17.2151500, (0 missing)
##       children < 1.5      to the right, improve=13.6844100, (0 missing)
##       bmi      < 31.1925  to the left,  improve=12.0039700, (0 missing)
##       region   < 0.5      to the right, improve= 0.5812166, (0 missing)
##   Surrogate splits:
##       charges  < 6659.237 to the left,  agree=0.897, adj=0.791, (0 split)
##       bmi      < 31.3025  to the left,  agree=0.566, adj=0.123, (0 split)
##       sex      < 0.5      to the right, agree=0.534, adj=0.059, (0 split)
##       region   < 0.5      to the right, agree=0.526, adj=0.043, (0 split)
##       children < 3.5      to the right, agree=0.516, adj=0.021, (0 split)
## 
## Node number 13: 95 observations,    complexity param=0.005630631
##   predicted class=1  expected loss=0.06315789  P(node) =0.08878505
##     class counts:     6    89
##    probabilities: 0.063 0.937 
##   left son=26 (15 obs) right son=27 (80 obs)
##   Primary splits:
##       charges  < 21902.02 to the left,  improve=2.6004390, (0 missing)
##       bmi      < 29.765   to the left,  improve=2.2421050, (0 missing)
##       children < 3.5      to the right, improve=1.7740200, (0 missing)
##       age      < 35.5     to the left,  improve=0.8363513, (0 missing)
##       region   < 1.5      to the left,  improve=0.1132164, (0 missing)
##   Surrogate splits:
##       bmi < 29.765   to the left,  agree=0.905, adj=0.400, (0 split)
##       age < 21       to the left,  agree=0.863, adj=0.133, (0 split)
## 
## Node number 16: 157 observations,    complexity param=0.001126126
##   predicted class=0  expected loss=0.006369427  P(node) =0.146729
##     class counts:   156     1
##    probabilities: 0.994 0.006 
##   left son=32 (147 obs) right son=33 (10 obs)
##   Primary splits:
##       bmi      < 25.1275  to the left,  improve=0.18726110, (0 missing)
##       children < 1.5      to the left,  improve=0.03073941, (0 missing)
##       region   < 0.5      to the right, improve=0.02981434, (0 missing)
##       charges  < 9030.432 to the left,  improve=0.02362478, (0 missing)
##       age      < 44.5     to the left,  improve=0.02004803, (0 missing)
## 
## Node number 17: 27 observations,    complexity param=0.004504505
##   predicted class=0  expected loss=0.2222222  P(node) =0.02523364
##     class counts:    21     6
##    probabilities: 0.778 0.222 
##   left son=34 (19 obs) right son=35 (8 obs)
##   Primary splits:
##       children < 1.5      to the left,  improve=6.3333330, (0 missing)
##       charges  < 9016.941 to the left,  improve=2.4761900, (0 missing)
##       age      < 44.5     to the left,  improve=1.8333330, (0 missing)
##       bmi      < 25.3825  to the right, improve=1.3333330, (0 missing)
##       region   < 1.5      to the left,  improve=0.5333333, (0 missing)
##   Surrogate splits:
##       bmi     < 25.3825  to the right, agree=0.741, adj=0.125, (0 split)
##       charges < 12385.87 to the left,  agree=0.741, adj=0.125, (0 split)
## 
## Node number 20: 15 observations
##   predicted class=0  expected loss=0  P(node) =0.01401869
##     class counts:    15     0
##    probabilities: 1.000 0.000 
## 
## Node number 21: 8 observations
##   predicted class=1  expected loss=0  P(node) =0.007476636
##     class counts:     0     8
##    probabilities: 0.000 1.000 
## 
## Node number 24: 187 observations,    complexity param=0.01351351
##   predicted class=0  expected loss=0.1550802  P(node) =0.1747664
##     class counts:   158    29
##    probabilities: 0.845 0.155 
##   left son=48 (169 obs) right son=49 (18 obs)
##   Primary splits:
##       bmi      < 40.1375  to the left,  improve=10.42547000, (0 missing)
##       children < 1.5      to the right, improve= 8.28593800, (0 missing)
##       charges  < 4170.071 to the right, improve= 3.18811300, (0 missing)
##       age      < 27.5     to the right, improve= 0.66374980, (0 missing)
##       region   < 2.5      to the left,  improve= 0.04327544, (0 missing)
## 
## Node number 25: 191 observations,    complexity param=0.04391892
##   predicted class=1  expected loss=0.460733  P(node) =0.1785047
##     class counts:    88   103
##    probabilities: 0.461 0.539 
##   left son=50 (56 obs) right son=51 (135 obs)
##   Primary splits:
##       children < 2.5      to the right, improve=10.187190, (0 missing)
##       bmi      < 34.49    to the left,  improve= 7.423943, (0 missing)
##       charges  < 11881.66 to the right, improve= 2.994786, (0 missing)
##       age      < 53.5     to the right, improve= 1.934244, (0 missing)
##       region   < 1.5      to the left,  improve= 1.254331, (0 missing)
##   Surrogate splits:
##       age < 62.5     to the right, agree=0.723, adj=0.054, (0 split)
## 
## Node number 26: 15 observations,    complexity param=0.005630631
##   predicted class=1  expected loss=0.3333333  P(node) =0.01401869
##     class counts:     5    10
##    probabilities: 0.333 0.667 
##   left son=52 (5 obs) right son=53 (10 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=6.6666670, (0 missing)
##       charges  < 19826.58 to the right, improve=1.4880950, (0 missing)
##       bmi      < 29.315   to the right, improve=0.9523810, (0 missing)
##       age      < 35.5     to the left,  improve=0.5128205, (0 missing)
##       region   < 2.5      to the right, improve=0.1282051, (0 missing)
##   Surrogate splits:
##       bmi     < 28.535   to the right, agree=0.733, adj=0.2, (0 split)
##       charges < 19826.58 to the right, agree=0.733, adj=0.2, (0 split)
## 
## Node number 27: 80 observations,    complexity param=0.001126126
##   predicted class=1  expected loss=0.0125  P(node) =0.07476636
##     class counts:     1    79
##    probabilities: 0.013 0.988 
##   left son=54 (9 obs) right son=55 (71 obs)
##   Primary splits:
##       bmi      < 29.07    to the left,  improve=0.19722220, (0 missing)
##       charges  < 33545.31 to the left,  improve=0.14166670, (0 missing)
##       children < 2.5      to the right, improve=0.07023810, (0 missing)
##       age      < 42.5     to the left,  improve=0.03382353, (0 missing)
##       region   < 1.5      to the left,  improve=0.02905405, (0 missing)
##   Surrogate splits:
##       charges < 29526.8  to the left,  agree=0.975, adj=0.778, (0 split)
## 
## Node number 32: 147 observations
##   predicted class=0  expected loss=0  P(node) =0.1373832
##     class counts:   147     0
##    probabilities: 1.000 0.000 
## 
## Node number 33: 10 observations,    complexity param=0.001126126
##   predicted class=0  expected loss=0.1  P(node) =0.009345794
##     class counts:     9     1
##    probabilities: 0.900 0.100 
##   left son=66 (9 obs) right son=67 (1 obs)
##   Primary splits:
##       children < 1.5      to the left,  improve=1.8000000, (0 missing)
##       region   < 0.5      to the right, improve=0.4666667, (0 missing)
##       charges  < 8768.868 to the left,  improve=0.3000000, (0 missing)
##       age      < 43.5     to the left,  improve=0.2000000, (0 missing)
##       sex      < 0.5      to the right, improve=0.2000000, (0 missing)
## 
## Node number 34: 19 observations
##   predicted class=0  expected loss=0  P(node) =0.01775701
##     class counts:    19     0
##    probabilities: 1.000 0.000 
## 
## Node number 35: 8 observations,    complexity param=0.004504505
##   predicted class=1  expected loss=0.25  P(node) =0.007476636
##     class counts:     2     6
##    probabilities: 0.250 0.750 
##   left son=70 (2 obs) right son=71 (6 obs)
##   Primary splits:
##       age      < 38       to the left,  improve=3.0000000, (0 missing)
##       charges  < 7018.252 to the left,  improve=3.0000000, (0 missing)
##       bmi      < 25.7425  to the right, improve=1.6666670, (0 missing)
##       children < 2.5      to the left,  improve=0.6000000, (0 missing)
##       region   < 0.5      to the left,  improve=0.3333333, (0 missing)
##   Surrogate splits:
##       charges < 7018.252 to the left,  agree=1.000, adj=1.0, (0 split)
##       bmi     < 25.7425  to the right, agree=0.875, adj=0.5, (0 split)
## 
## Node number 48: 169 observations,    complexity param=0.01276276
##   predicted class=0  expected loss=0.1005917  P(node) =0.1579439
##     class counts:   152    17
##    probabilities: 0.899 0.101 
##   left son=96 (99 obs) right son=97 (70 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=4.8370250, (0 missing)
##       bmi      < 31.1925  to the left,  improve=4.0863750, (0 missing)
##       charges  < 4170.071 to the right, improve=2.2165900, (0 missing)
##       age      < 32.5     to the right, improve=0.8416554, (0 missing)
##       region   < 1.5      to the right, improve=0.2004161, (0 missing)
##   Surrogate splits:
##       charges < 3393.167 to the right, agree=0.692, adj=0.257, (0 split)
##       bmi     < 28.4525  to the right, agree=0.598, adj=0.029, (0 split)
##       age     < 18.5     to the right, agree=0.592, adj=0.014, (0 split)
## 
## Node number 49: 18 observations,    complexity param=0.006756757
##   predicted class=1  expected loss=0.3333333  P(node) =0.01682243
##     class counts:     6    12
##    probabilities: 0.333 0.667 
##   left son=98 (9 obs) right son=99 (9 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=4.0000000, (0 missing)
##       age      < 28.5     to the right, improve=1.5384620, (0 missing)
##       bmi      < 45.06    to the left,  improve=1.5384620, (0 missing)
##       charges  < 3979.797 to the right, improve=1.5384620, (0 missing)
##       region   < 0.5      to the right, improve=0.2352941, (0 missing)
##   Surrogate splits:
##       bmi     < 42.92    to the right, agree=0.778, adj=0.556, (0 split)
##       age     < 26.5     to the right, agree=0.722, adj=0.444, (0 split)
##       charges < 3551.876 to the right, agree=0.722, adj=0.444, (0 split)
##       region  < 1.5      to the right, agree=0.611, adj=0.222, (0 split)
##       sex     < 0.5      to the right, agree=0.556, adj=0.111, (0 split)
## 
## Node number 50: 56 observations,    complexity param=0.03378378
##   predicted class=0  expected loss=0.2857143  P(node) =0.05233645
##     class counts:    40    16
##    probabilities: 0.714 0.286 
##   left son=100 (41 obs) right son=101 (15 obs)
##   Primary splits:
##       bmi      < 29.9725  to the right, improve=20.9059200, (0 missing)
##       age      < 53.5     to the right, improve= 2.7501910, (0 missing)
##       children < 4.5      to the left,  improve= 2.1164020, (0 missing)
##       charges  < 11258.98 to the right, improve= 1.4404760, (0 missing)
##       region   < 0.5      to the right, improve= 0.3180124, (0 missing)
## 
## Node number 51: 135 observations,    complexity param=0.02702703
##   predicted class=1  expected loss=0.3555556  P(node) =0.1261682
##     class counts:    48    87
##    probabilities: 0.356 0.644 
##   left son=102 (78 obs) right son=103 (57 obs)
##   Primary splits:
##       bmi     < 33.5825  to the left,  improve=18.105530, (0 missing)
##       region  < 1.5      to the left,  improve= 2.115893, (0 missing)
##       charges < 7099.057 to the left,  improve= 1.866667, (0 missing)
##       age     < 61.5     to the left,  improve= 1.312821, (0 missing)
##       sex     < 0.5      to the left,  improve= 0.533928, (0 missing)
##   Surrogate splits:
##       charges < 14177.35 to the left,  agree=0.622, adj=0.105, (0 split)
##       age     < 60.5     to the left,  agree=0.600, adj=0.053, (0 split)
## 
## Node number 52: 5 observations
##   predicted class=0  expected loss=0  P(node) =0.004672897
##     class counts:     5     0
##    probabilities: 1.000 0.000 
## 
## Node number 53: 10 observations
##   predicted class=1  expected loss=0  P(node) =0.009345794
##     class counts:     0    10
##    probabilities: 0.000 1.000 
## 
## Node number 54: 9 observations,    complexity param=0.001126126
##   predicted class=1  expected loss=0.1111111  P(node) =0.008411215
##     class counts:     1     8
##    probabilities: 0.111 0.889 
##   left son=108 (1 obs) right son=109 (8 obs)
##   Primary splits:
##       charges  < 30828.06 to the right, improve=1.7777780, (0 missing)
##       age      < 44.5     to the left,  improve=0.7777778, (0 missing)
##       bmi      < 28.2625  to the right, improve=0.7777778, (0 missing)
##       children < 2.5      to the right, improve=0.7777778, (0 missing)
##       sex      < 0.5      to the right, improve=0.1777778, (0 missing)
## 
## Node number 55: 71 observations
##   predicted class=1  expected loss=0  P(node) =0.06635514
##     class counts:     0    71
##    probabilities: 0.000 1.000 
## 
## Node number 66: 9 observations
##   predicted class=0  expected loss=0  P(node) =0.008411215
##     class counts:     9     0
##    probabilities: 1.000 0.000 
## 
## Node number 67: 1 observations
##   predicted class=1  expected loss=0  P(node) =0.0009345794
##     class counts:     0     1
##    probabilities: 0.000 1.000 
## 
## Node number 70: 2 observations
##   predicted class=0  expected loss=0  P(node) =0.001869159
##     class counts:     2     0
##    probabilities: 1.000 0.000 
## 
## Node number 71: 6 observations
##   predicted class=1  expected loss=0  P(node) =0.005607477
##     class counts:     0     6
##    probabilities: 0.000 1.000 
## 
## Node number 96: 99 observations
##   predicted class=0  expected loss=0  P(node) =0.09252336
##     class counts:    99     0
##    probabilities: 1.000 0.000 
## 
## Node number 97: 70 observations,    complexity param=0.01276276
##   predicted class=0  expected loss=0.2428571  P(node) =0.06542056
##     class counts:    53    17
##    probabilities: 0.757 0.243 
##   left son=194 (41 obs) right son=195 (29 obs)
##   Primary splits:
##       bmi     < 31.1275  to the left,  improve=11.6738900, (0 missing)
##       age     < 32.5     to the right, improve= 1.4814940, (0 missing)
##       charges < 4194.078 to the right, improve= 1.2000940, (0 missing)
##       region  < 1.5      to the right, improve= 1.0296530, (0 missing)
##       sex     < 0.5      to the right, improve= 0.3474323, (0 missing)
##   Surrogate splits:
##       charges < 4194.078 to the right, agree=0.671, adj=0.207, (0 split)
##       age     < 20.5     to the right, agree=0.629, adj=0.103, (0 split)
## 
## Node number 98: 9 observations,    complexity param=0.006756757
##   predicted class=0  expected loss=0.3333333  P(node) =0.008411215
##     class counts:     6     3
##    probabilities: 0.667 0.333 
##   left son=196 (6 obs) right son=197 (3 obs)
##   Primary splits:
##       bmi      < 45.06    to the left,  improve=4.00, (0 missing)
##       age      < 28.5     to the right, improve=1.00, (0 missing)
##       sex      < 0.5      to the left,  improve=1.00, (0 missing)
##       charges  < 3979.797 to the right, improve=1.00, (0 missing)
##       children < 4        to the right, improve=0.25, (0 missing)
##   Surrogate splits:
##       charges < 4720.013 to the right, agree=0.778, adj=0.333, (0 split)
## 
## Node number 99: 9 observations
##   predicted class=1  expected loss=0  P(node) =0.008411215
##     class counts:     0     9
##    probabilities: 0.000 1.000 
## 
## Node number 100: 41 observations,    complexity param=0.002252252
##   predicted class=0  expected loss=0.02439024  P(node) =0.03831776
##     class counts:    40     1
##    probabilities: 0.976 0.024 
##   left son=200 (40 obs) right son=201 (1 obs)
##   Primary splits:
##       bmi      < 45.725   to the left,  improve=1.95122000, (0 missing)
##       children < 4.5      to the left,  improve=1.95122000, (0 missing)
##       charges  < 12543.91 to the left,  improve=0.06886657, (0 missing)
##       age      < 52.5     to the right, improve=0.05648267, (0 missing)
##       region   < 1.5      to the left,  improve=0.03817603, (0 missing)
## 
## Node number 101: 15 observations
##   predicted class=1  expected loss=0  P(node) =0.01401869
##     class counts:     0    15
##    probabilities: 0.000 1.000 
## 
## Node number 102: 78 observations,    complexity param=0.02027027
##   predicted class=0  expected loss=0.4230769  P(node) =0.0728972
##     class counts:    45    33
##    probabilities: 0.577 0.423 
##   left son=204 (31 obs) right son=205 (47 obs)
##   Primary splits:
##       bmi     < 29.34    to the left,  improve=2.8017000, (0 missing)
##       charges < 7347.962 to the left,  improve=2.3945700, (0 missing)
##       age     < 43.5     to the left,  improve=0.9086691, (0 missing)
##       region  < 0.5      to the right, improve=0.6350258, (0 missing)
##       sex     < 0.5      to the left,  improve=0.2263796, (0 missing)
##   Surrogate splits:
##       charges < 7205.138 to the left,  agree=0.654, adj=0.129, (0 split)
##       age     < 42.5     to the left,  agree=0.615, adj=0.032, (0 split)
## 
## Node number 103: 57 observations,    complexity param=0.002252252
##   predicted class=1  expected loss=0.05263158  P(node) =0.05327103
##     class counts:     3    54
##    probabilities: 0.053 0.947 
##   left son=206 (8 obs) right son=207 (49 obs)
##   Primary splits:
##       bmi      < 39.9875  to the right, improve=0.7250269, (0 missing)
##       children < 1.5      to the right, improve=0.4668192, (0 missing)
##       charges  < 8651.546 to the right, improve=0.1706970, (0 missing)
##       age      < 45.5     to the right, improve=0.1342105, (0 missing)
##       region   < 1.5      to the left,  improve=0.1207185, (0 missing)
## 
## Node number 108: 1 observations
##   predicted class=0  expected loss=0  P(node) =0.0009345794
##     class counts:     1     0
##    probabilities: 1.000 0.000 
## 
## Node number 109: 8 observations
##   predicted class=1  expected loss=0  P(node) =0.007476636
##     class counts:     0     8
##    probabilities: 0.000 1.000 
## 
## Node number 194: 41 observations
##   predicted class=0  expected loss=0  P(node) =0.03831776
##     class counts:    41     0
##    probabilities: 1.000 0.000 
## 
## Node number 195: 29 observations,    complexity param=0.01276276
##   predicted class=1  expected loss=0.4137931  P(node) =0.0271028
##     class counts:    12    17
##    probabilities: 0.414 0.586 
##   left son=390 (12 obs) right son=391 (17 obs)
##   Primary splits:
##       bmi     < 35       to the right, improve=14.0689700, (0 missing)
##       region  < 1.5      to the right, improve= 2.5003380, (0 missing)
##       age     < 18.5     to the left,  improve= 1.4763730, (0 missing)
##       charges < 18933.33 to the right, improve= 1.4763730, (0 missing)
##       sex     < 0.5      to the right, improve= 0.8880131, (0 missing)
##   Surrogate splits:
##       region  < 1.5      to the right, agree=0.690, adj=0.250, (0 split)
##       age     < 18.5     to the left,  agree=0.655, adj=0.167, (0 split)
##       charges < 18933.33 to the right, agree=0.655, adj=0.167, (0 split)
##       sex     < 0.5      to the right, agree=0.621, adj=0.083, (0 split)
## 
## Node number 196: 6 observations
##   predicted class=0  expected loss=0  P(node) =0.005607477
##     class counts:     6     0
##    probabilities: 1.000 0.000 
## 
## Node number 197: 3 observations
##   predicted class=1  expected loss=0  P(node) =0.002803738
##     class counts:     0     3
##    probabilities: 0.000 1.000 
## 
## Node number 200: 40 observations
##   predicted class=0  expected loss=0  P(node) =0.03738318
##     class counts:    40     0
##    probabilities: 1.000 0.000 
## 
## Node number 201: 1 observations
##   predicted class=1  expected loss=0  P(node) =0.0009345794
##     class counts:     0     1
##    probabilities: 0.000 1.000 
## 
## Node number 204: 31 observations,    complexity param=0.01801802
##   predicted class=0  expected loss=0.2580645  P(node) =0.02897196
##     class counts:    23     8
##    probabilities: 0.742 0.258 
##   left son=408 (23 obs) right son=409 (8 obs)
##   Primary splits:
##       children < 1.5      to the left,  improve=11.8709700, (0 missing)
##       bmi      < 26.8875  to the right, improve= 2.4843010, (0 missing)
##       charges  < 7676.924 to the left,  improve= 1.2043010, (0 missing)
##       age      < 47.5     to the right, improve= 0.9043011, (0 missing)
##       region   < 2.5      to the left,  improve= 0.2218449, (0 missing)
##   Surrogate splits:
##       bmi < 26.2675  to the right, agree=0.806, adj=0.25, (0 split)
## 
## Node number 205: 47 observations,    complexity param=0.02027027
##   predicted class=1  expected loss=0.4680851  P(node) =0.04392523
##     class counts:    22    25
##    probabilities: 0.468 0.532 
##   left son=410 (23 obs) right son=411 (24 obs)
##   Primary splits:
##       children < 1.5      to the right, improve=11.5455600, (0 missing)
##       bmi      < 32.865   to the right, improve= 1.2328270, (0 missing)
##       charges  < 11881.66 to the right, improve= 0.8133462, (0 missing)
##       region   < 1.5      to the left,  improve= 0.5764165, (0 missing)
##       age      < 43.5     to the left,  improve= 0.4747681, (0 missing)
##   Surrogate splits:
##       charges < 11880.23 to the right, agree=0.681, adj=0.348, (0 split)
##       age     < 45.5     to the left,  agree=0.596, adj=0.174, (0 split)
##       bmi     < 29.7825  to the left,  agree=0.574, adj=0.130, (0 split)
##       region  < 0.5      to the right, agree=0.574, adj=0.130, (0 split)
## 
## Node number 206: 8 observations,    complexity param=0.002252252
##   predicted class=1  expected loss=0.25  P(node) =0.007476636
##     class counts:     2     6
##    probabilities: 0.250 0.750 
##   left son=412 (2 obs) right son=413 (6 obs)
##   Primary splits:
##       bmi      < 43.19    to the left,  improve=3.0000000, (0 missing)
##       children < 1.5      to the right, improve=1.6666670, (0 missing)
##       age      < 49.5     to the left,  improve=0.6000000, (0 missing)
##       region   < 1.5      to the left,  improve=0.3333333, (0 missing)
##       charges  < 8651.546 to the right, improve=0.3333333, (0 missing)
##   Surrogate splits:
##       children < 1.5      to the right, agree=0.875, adj=0.5, (0 split)
## 
## Node number 207: 49 observations,    complexity param=0.001126126
##   predicted class=1  expected loss=0.02040816  P(node) =0.04579439
##     class counts:     1    48
##    probabilities: 0.020 0.980 
##   left son=414 (5 obs) right son=415 (44 obs)
##   Primary splits:
##       bmi      < 34.3075  to the left,  improve=0.35918370, (0 missing)
##       age      < 56.5     to the right, improve=0.18140590, (0 missing)
##       region   < 0.5      to the left,  improve=0.18140590, (0 missing)
##       charges  < 12935.13 to the right, improve=0.11302980, (0 missing)
##       children < 1.5      to the right, improve=0.05918367, (0 missing)
## 
## Node number 390: 12 observations
##   predicted class=0  expected loss=0  P(node) =0.01121495
##     class counts:    12     0
##    probabilities: 1.000 0.000 
## 
## Node number 391: 17 observations
##   predicted class=1  expected loss=0  P(node) =0.01588785
##     class counts:     0    17
##    probabilities: 0.000 1.000 
## 
## Node number 408: 23 observations
##   predicted class=0  expected loss=0  P(node) =0.02149533
##     class counts:    23     0
##    probabilities: 1.000 0.000 
## 
## Node number 409: 8 observations
##   predicted class=1  expected loss=0  P(node) =0.007476636
##     class counts:     0     8
##    probabilities: 0.000 1.000 
## 
## Node number 410: 23 observations,    complexity param=0.009009009
##   predicted class=0  expected loss=0.173913  P(node) =0.02149533
##     class counts:    19     4
##    probabilities: 0.826 0.174 
##   left son=820 (19 obs) right son=821 (4 obs)
##   Primary splits:
##       bmi     < 29.9375  to the right, improve=6.60869600, (0 missing)
##       charges < 30647.57 to the left,  improve=1.42687700, (0 missing)
##       region  < 1.5      to the left,  improve=0.56254180, (0 missing)
##       age     < 44.5     to the right, improve=0.25155280, (0 missing)
##       sex     < 0.5      to the left,  improve=0.02408027, (0 missing)
## 
## Node number 411: 24 observations,    complexity param=0.006756757
##   predicted class=1  expected loss=0.125  P(node) =0.02242991
##     class counts:     3    21
##    probabilities: 0.125 0.875 
##   left son=822 (3 obs) right son=823 (21 obs)
##   Primary splits:
##       bmi     < 29.965   to the left,  improve=5.2500000, (0 missing)
##       charges < 9289.083 to the left,  improve=0.7500000, (0 missing)
##       age     < 49.5     to the left,  improve=0.5357143, (0 missing)
##       sex     < 0.5      to the left,  improve=0.0472028, (0 missing)
##       region  < 1.5      to the left,  improve=0.0472028, (0 missing)
## 
## Node number 412: 2 observations
##   predicted class=0  expected loss=0  P(node) =0.001869159
##     class counts:     2     0
##    probabilities: 1.000 0.000 
## 
## Node number 413: 6 observations
##   predicted class=1  expected loss=0  P(node) =0.005607477
##     class counts:     0     6
##    probabilities: 0.000 1.000 
## 
## Node number 414: 5 observations,    complexity param=0.001126126
##   predicted class=1  expected loss=0.2  P(node) =0.004672897
##     class counts:     1     4
##    probabilities: 0.200 0.800 
##   left son=828 (1 obs) right son=829 (4 obs)
##   Primary splits:
##       age      < 55.5     to the right, improve=1.6, (0 missing)
##       bmi      < 34.2525  to the right, improve=1.6, (0 missing)
##       children < 1.5      to the right, improve=1.6, (0 missing)
##       charges  < 12024.66 to the right, improve=1.6, (0 missing)
##       sex      < 0.5      to the left,  improve=0.6, (0 missing)
## 
## Node number 415: 44 observations
##   predicted class=1  expected loss=0  P(node) =0.0411215
##     class counts:     0    44
##    probabilities: 0.000 1.000 
## 
## Node number 820: 19 observations
##   predicted class=0  expected loss=0  P(node) =0.01775701
##     class counts:    19     0
##    probabilities: 1.000 0.000 
## 
## Node number 821: 4 observations
##   predicted class=1  expected loss=0  P(node) =0.003738318
##     class counts:     0     4
##    probabilities: 0.000 1.000 
## 
## Node number 822: 3 observations
##   predicted class=0  expected loss=0  P(node) =0.002803738
##     class counts:     3     0
##    probabilities: 1.000 0.000 
## 
## Node number 823: 21 observations
##   predicted class=1  expected loss=0  P(node) =0.01962617
##     class counts:     0    21
##    probabilities: 0.000 1.000 
## 
## Node number 828: 1 observations
##   predicted class=0  expected loss=0  P(node) =0.0009345794
##     class counts:     1     0
##    probabilities: 1.000 0.000 
## 
## Node number 829: 4 observations
##   predicted class=1  expected loss=0  P(node) =0.003738318
##     class counts:     0     4
##    probabilities: 0.000 1.000
# Measures of Predictive Performance

# Root-node (baseline) error: proportion of claims (class 1) in the training
# data. rpart's cptable errors are expressed relative to this baseline, so
# multiplying by it converts them back to absolute error rates.
rootnode_err <- mean(insurance.data.train$insuranceclaim == 1)

# Resubstitution (training) error: baseline error scaled by the smallest
# relative error in the complexity-parameter table.
cp_tab <- pfit.allp$cptable
prelerr <- cp_tab[which.min(cp_tab[, "rel error"]), "rel error"]
(presub.err_rate <- rootnode_err * prelerr)
## [1] 0

# Cross-validated error: baseline error scaled by the smallest xerror.
pxerr <- cp_tab[which.min(cp_tab[, "xerror"]), "xerror"]
(pcv.err_rate <- rootnode_err * pxerr)
## [1] 0.03030647

The resubstitution (training) error rate — the error rate of the fitted tree when applied back to the training data it was built on — is 0%.

The post-pruning cross-validation error rate, representing the estimated error rate after pruning the tree using cross-validation, is approximately 3.03%.

In conclusion, the post-pruning cross-validation error suggests that the pruned classification tree performs well in terms of predictive accuracy on unseen data.

# Evaluate the pruned classification tree on the held-out test set.
test_df <- data.frame(actual = insurance.data.test$insuranceclaim, pred = NA)
test_df$pred <- predict(pfit.allp, newdata = insurance.data.test, type = "class")
(conf_matrix_pruned_tree <- 
    table(test_df$pred, test_df$actual)) # confusion matrix: rows = predicted, cols = actual
##    
##       0   1
##   0 108   3
##   1   3 154
# NOTE(review): caret's sensitivity()/specificity() on a table treat the
# first factor level ("0") as the positive class by default — confirm this
# matches the intended definition of sensitivity for claims (class 1).
sensitivity(conf_matrix_pruned_tree)
## [1] 0.972973
specificity(conf_matrix_pruned_tree)
## [1] 0.9808917
# Missclassification error rate:
(conf_matrix_pruned_tree[1, 2] + 
    conf_matrix_pruned_tree[2, 1])/sum(conf_matrix_pruned_tree) 
## [1] 0.02238806
# With rows = predicted and columns = actual:
#   [1, 2] is predicted 0 / actual 1 -> false negative
#   [2, 1] is predicted 1 / actual 0 -> false positive
# (The original FP/FN labels were swapped; accuracy is unaffected because
# it sums FP + FN symmetrically.)
TP <- conf_matrix_pruned_tree[2, 2]  # True Positives  (predicted 1, actual 1)
TN <- conf_matrix_pruned_tree[1, 1]  # True Negatives  (predicted 0, actual 0)
FN <- conf_matrix_pruned_tree[1, 2]  # False Negatives (predicted 0, actual 1)
FP <- conf_matrix_pruned_tree[2, 1]  # False Positives (predicted 1, actual 0)

# Calculate Accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)

# Print the accuracy
print(paste("Accuracy:", round(accuracy, 4)))
## [1] "Accuracy: 0.9776"

The pruned classification tree demonstrates exceptional performance on the test data, with sensitivity reaching approximately 97.30%. Moreover, the specificity is impressively high at approximately 98.09%, indicating the model’s proficiency in correctly recognizing instances where no insurance claims are made.

In terms of overall accuracy, the pruned tree exhibits a remarkably low misclassification error rate of about 2.24%. This metric takes into account both false positives (incorrectly predicted insurance claims) and false negatives (missed insurance claims), offering a comprehensive view of the model’s performance. The minimal misclassification error underscores the pruned tree’s efficacy in making precise predictions across the diverse scenarios presented in the test data. Overall, these results highlight the robustness and accuracy of the pruned classification tree in effectively identifying both positive and negative cases in the context of insurance claims.

#4. Random Forest

The random forest (RF) is an ensemble learning method which consists of aggregating a large number of decision trees to avoid overfitting and build a better classification model

The word random appears because in training the data, predictors are chosen randomly from the full set of predictors.

The word forest is used because output from multiple trees are used to make a decision. That is, two types of randomnesses go into constructing a random forest:

each tree is built on a random sample from the dataset, and

at each tree node, a subset of features are randomly selected to generate the best split.

Out-of-bag (OOB) observations from the first bootstrap sample are those observations in the training sample that did not enter the first bootstrap sample. Similarly, we will have OOB observations corresponding to each bootstrap sample (decision tree).

# Random Forest 

# Fit a random forest with ranger on all seven predictors.
# NOTE(review): insuranceclaim is stored as numeric 0/1, so ranger fits a
# *regression* forest ("Type: Regression" in the output below) rather than a
# classification forest; downstream code thresholds the continuous
# predictions at 0.5 to recover class labels.
# importance = 'impurity' records impurity-based variable importance;
# mtry = 3 predictors are sampled at random at each split.
fit.rf.ranger <- ranger(insuranceclaim ~ ., data = insurance.data.train, 
                   importance = 'impurity', mtry = 3)
print(fit.rf.ranger)
## Ranger result
## 
## Call:
##  ranger(insuranceclaim ~ ., data = insurance.data.train, importance = "impurity",      mtry = 3) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      1070 
## Number of independent variables:  7 
## Mtry:                             3 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.03148481 
## R squared (OOB):                  0.8704297

The Ranger regression model, built on the insurance data, comprises 500 trees with a sample size of 1070 and incorporates seven independent variables. For each split, the model randomly samples three variables, and the target node size is set at 5. The variable importance is assessed based on impurity, and the split rule is determined by variance. The out-of-bag prediction error (mean squared error) is measured at 0.03148481, indicating a relatively low prediction error, while the R-squared value stands at 0.8704297, signifying a high proportion of explained variance in the target variable. These results collectively suggest that the Ranger regression model performs well in predicting insurance claims, offering accuracy and a strong ability to capture variability in the data.

After training a RF, we would like to understand which variables have the most predictive power. Variables with high importance will have a significant impact on the binary outcomes, while we may consider dropping variables with low importance from the model (leading to a more parsimonious model). We can use the vi() function in the R package vip to extract and print a tibble of variable importance scores. We can also construct a variable importance plot using the vip() function, as shown below.

# Extract the impurity-based importance scores as a tibble and display them.
v1 <- vi(fit.rf.ranger)
print(v1)
## # A tibble: 7 × 2
##   Variable Importance
##   <chr>         <dbl>
## 1 bmi           93.1 
## 2 children      73.5 
## 3 charges       34.5 
## 4 smoker        24.9 
## 5 age           20.9 
## 6 region         3.69
## 7 sex            1.37
# Build the importance plot, then render it with a descriptive title.
vip_plot <- vip(v1)
print(vip_plot + ggtitle("Variable Importance Plot for Insurance Data"))

BMI and children have the most predictive power, whereas charges and smoker have intermediate power, and sex, region, and age have less predictive power. Let’s develop models by dropping those low-importance columns one by one.

# Score the fitted forest on the held-out test set.
pred <- predict(fit.rf.ranger, data = insurance.data.test)

# Pair the actual labels with 0/1 predictions obtained by thresholding the
# regression output at 0.5.
test_df <- data.frame(
  actual = insurance.data.test$insuranceclaim,
  pred = as.numeric(pred$predictions > 0.5)
)

# Confusion matrix: rows = predicted, columns = actual.
conf_matrix_rf <- table(test_df$pred, test_df$actual)

# Display the confusion matrix
print(conf_matrix_rf)
##    
##       0   1
##   0 106   2
##   1   5 155

The Random Forest model on the test data achieved 155 True Positives, correctly predicting insurance claims, and 106 True Negatives, accurately predicting cases with no insurance claims. However, it made 5 False Positives, wrongly predicting claims where none occurred, and 2 False Negatives, missing actual claims. Overall, the model demonstrates good predictive accuracy but has some room for improvement in minimizing false predictions.

# Sensitivity — NOTE(review): caret treats the first factor level ("0") as
# the positive class by default; confirm this matches the intended metric.
sensitivity(conf_matrix_rf)
## [1] 0.954955
# Specificity
specificity(conf_matrix_rf)
## [1] 0.9872611
# Missclassification error rate:
(conf_matrix_rf[1,2] + conf_matrix_rf[2,1])/sum(conf_matrix_rf) 
## [1] 0.0261194
# With rows = predicted and columns = actual:
#   [1, 2] is predicted 0 / actual 1 -> false negative
#   [2, 1] is predicted 1 / actual 0 -> false positive
# (The original FP/FN labels were swapped; accuracy is unaffected because
# it sums FP + FN symmetrically.)
TP <- conf_matrix_rf[2, 2]  # True Positives  (predicted 1, actual 1)
TN <- conf_matrix_rf[1, 1]  # True Negatives  (predicted 0, actual 0)
FN <- conf_matrix_rf[1, 2]  # False Negatives (predicted 0, actual 1)
FP <- conf_matrix_rf[2, 1]  # False Positives (predicted 1, actual 0)

# Calculate Accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)

# Print the accuracy
print(paste("Accuracy:", round(accuracy, 4)))
## [1] "Accuracy: 0.9739"

The Random Forest model exhibits robust performance on the test data, capturing approximately 95.50% of actual insurance claims and accurately identifying around 98.73% of no-claim cases. The model’s overall accuracy remains high, with a misclassification error rate of approximately 2.61%, reflecting its effectiveness in classifying instances. This indicates a reliable and well-performing model for predicting insurance claims.

#Dropping the columns which are having the less vip value

# dropped the sex column
# Refit the forest without `sex`, the lowest-importance predictor above.
# NOTE(review): this overwrites fit.rf.ranger, so subsequent evaluation
# chunks always use the most recently fitted model.
fit.rf.ranger <- ranger(insuranceclaim ~ bmi+children+age+smoker+charges+region, data = insurance.data.train, 
                   importance = 'impurity', mtry = 3)

print(fit.rf.ranger)
## Ranger result
## 
## Call:
##  ranger(insuranceclaim ~ bmi + children + age + smoker + charges +      region, data = insurance.data.train, importance = "impurity",      mtry = 3) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      1070 
## Number of independent variables:  6 
## Mtry:                             3 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.02560531 
## R squared (OOB):                  0.8946258
# The outer parentheses make the assignment auto-print the importance tibble.
(v1 <- vi(fit.rf.ranger))
## # A tibble: 6 × 2
##   Variable Importance
##   <chr>         <dbl>
## 1 bmi           96.7 
## 2 children      75.2 
## 3 charges       33.0 
## 4 smoker        26.1 
## 5 age           20.1 
## 6 region         2.98
vip_plot <- vip(v1)
vip_plot + ggtitle("Variable Importance Plot - Bmi, Children, Smoker, Charges, Age, Region")

# Score the reduced model (sex dropped) on the test set.
pred <- predict(fit.rf.ranger, data = insurance.data.test)

# Pair actual labels with 0/1 predictions (regression output thresholded at 0.5).
test_df <- data.frame(
  actual = insurance.data.test$insuranceclaim,
  pred = as.numeric(pred$predictions > 0.5)
)

# Confusion matrix: rows = predicted, columns = actual.
conf_matrix_rf <- table(test_df$pred, test_df$actual)
print(conf_matrix_rf)
##    
##       0   1
##   0 109   1
##   1   2 156
# Sensitivity (caret treats the first level, "0", as the positive class)
sensitivity(conf_matrix_rf)
## [1] 0.981982
# Specificity
specificity(conf_matrix_rf)
## [1] 0.9936306
# Misclassification error rate: one minus the proportion on the diagonal.
1 - sum(diag(conf_matrix_rf))/sum(conf_matrix_rf)
## [1] 0.01119403

The Random Forest model assigns importance scores to different predictors for predicting insurance claims. BMI emerges as a highly influential factor (96.7), indicating its significant impact on predictions. The number of children follows closely, with a substantial importance score of 75.2, emphasizing its strong influence on claim likelihood. Charges contribute significantly (33.0), while the smoking status, age, and region also play roles, albeit to varying extents. Specifically, being a smoker has moderate importance (26.1), age is important (20.1), and region is comparatively less influential (2.98). In summary, BMI and the number of children are the most critical factors, shaping the Random Forest model’s predictions for insurance claims.

There is a significant impact from removing the columns with less predictive power, as both the sensitivity and specificity improved.

# dropped the sex and region column
# Refit the forest on the five strongest predictors (sex and region dropped).
# NOTE(review): this again overwrites fit.rf.ranger.
fit.rf.ranger <- ranger(insuranceclaim ~ bmi+children+age+smoker+charges, data = insurance.data.train, 
                   importance = 'impurity', mtry = 3)

print(fit.rf.ranger)
## Ranger result
## 
## Call:
##  ranger(insuranceclaim ~ bmi + children + age + smoker + charges,      data = insurance.data.train, importance = "impurity", mtry = 3) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      1070 
## Number of independent variables:  5 
## Mtry:                             3 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.02029162 
## R squared (OOB):                  0.9164934
# The outer parentheses make the assignment auto-print the importance tibble.
(v1 <- vi(fit.rf.ranger))
## # A tibble: 5 × 2
##   Variable Importance
##   <chr>         <dbl>
## 1 bmi            99.6
## 2 children       78.1
## 3 charges        30.3
## 4 smoker         28.1
## 5 age            19.6
vip_plot <- vip(v1)
vip_plot + ggtitle("Variable Importance Plot - Bmi, Children, Smoker, Charges, Age")

# Evaluate the five-predictor model on the held-out test set.
pred <- predict(fit.rf.ranger, data = insurance.data.test)
test_df <- data.frame(
  actual = insurance.data.test$insuranceclaim,
  pred = as.numeric(pred$predictions > 0.5)
)

# Confusion matrix: rows = predicted, columns = actual.
conf_matrix_rf <- table(test_df$pred, test_df$actual)
print(conf_matrix_rf)
##    
##       0   1
##   0 109   1
##   1   2 156
# Sensitivity (caret treats the first level, "0", as the positive class)
sensitivity(conf_matrix_rf)
## [1] 0.981982
# Specificity
specificity(conf_matrix_rf)
## [1] 0.9936306
# Misclassification error rate: one minus the proportion on the diagonal.
1 - sum(diag(conf_matrix_rf))/sum(conf_matrix_rf)
## [1] 0.01119403

# Re-score the *training* data. A perfect score here reflects memorization
# of the training sample, not generalization to new data.
pred <- predict(fit.rf.ranger, data = insurance.data.train)
test_df <- data.frame(
  actual = insurance.data.train$insuranceclaim,
  pred = as.numeric(pred$predictions > 0.5)
)
tab <- table(test_df$pred, test_df$actual)

# Training accuracy, sensitivity and specificity are all exactly 1.
sum(diag(tab))/sum(tab)
## [1] 1
sensitivity(tab)
## [1] 1
specificity(tab)
## [1] 1

The Ranger Random Forest model, built on predictors including BMI, number of children, age, smoker status, and charges, exhibits high predictive performance. The out-of-bag prediction error is low (MSE = 0.0203), indicating accurate predictions, and the R-squared value of 0.916 signifies the model’s capability to explain variance in the data. On the test set, the confusion matrix reveals a high accuracy of 98.88%, with sensitivity and specificity at 98.20% and 99.36%, respectively, and a minimal misclassification error rate of 1.12%. In terms of variable importance, BMI stands out as the most crucial predictor (99.6), followed by the number of children (78.1), charges (30.3), smoker status (28.1), and age (19.6). This suggests that BMI and the number of children play pivotal roles in predicting insurance claims, as emphasized by their high importance scores.

# dropped the sex, age and region column
# Refit the random forest on the four remaining predictors with
# impurity-based importance and mtry = 3 candidate variables per split.
# Note: the response is numeric 0/1, so ranger fits a REGRESSION forest
# (see "Type: Regression" below) and predictions are scores in [0, 1].
fit.rf.ranger <- ranger(insuranceclaim ~ bmi+children+smoker+charges, data = insurance.data.train, 
                   importance = 'impurity', mtry = 3)

print(fit.rf.ranger)
## Ranger result
## 
## Call:
##  ranger(insuranceclaim ~ bmi + children + smoker + charges, data = insurance.data.train,      importance = "impurity", mtry = 3) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      1070 
## Number of independent variables:  4 
## Mtry:                             3 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.02506871 
## R squared (OOB):                  0.8968341
# Impurity-based variable importance scores, largest first
(v1 <- vi(fit.rf.ranger))
## # A tibble: 4 × 2
##   Variable Importance
##   <chr>         <dbl>
## 1 bmi           101. 
## 2 children       80.3
## 3 charges        41.6
## 4 smoker         31.6
vip_plot <- vip(v1)
vip_plot + ggtitle("Variable Importance Plot - Bmi, Children, Smoker, Charges")

# Out-of-sample evaluation of the four-predictor forest on the test set.
pred <- predict(fit.rf.ranger, data = insurance.data.test)
# Create a data frame with actual and predicted values
# (regression scores thresholded at 0.5 to obtain class labels)
test_df <- data.frame(actual = insurance.data.test$insuranceclaim,
                      pred = ifelse(pred$predictions > 0.5, 1, 0))
# Create a confusion matrix (rows = predicted, columns = actual)
conf_matrix_rf <- table(test_df$pred, test_df$actual)
# Display the confusion matrix
print(conf_matrix_rf)
##    
##       0   1
##   0 109   3
##   1   2 154
# Sensitivity
sensitivity(conf_matrix_rf)
## [1] 0.981982
# Specificity
specificity(conf_matrix_rf)
## [1] 0.9808917
# Missclassification error rate:
(conf_matrix_rf[1,2] + conf_matrix_rf[2,1])/sum(conf_matrix_rf) 
## [1] 0.01865672
# With rows = predicted and columns = actual:
TP <- conf_matrix_rf[2, 2]  # True Positives  (predicted 1, actual 1)
TN <- conf_matrix_rf[1, 1]  # True Negatives  (predicted 0, actual 0)
# Bug fix: the original labels were swapped — [1, 2] is predicted 0 /
# actual 1 (a false NEGATIVE) and [2, 1] is predicted 1 / actual 0 (a
# false POSITIVE). The accuracy below is unaffected (the sum is symmetric).
FN <- conf_matrix_rf[1, 2]  # False Negatives (predicted 0, actual 1)
FP <- conf_matrix_rf[2, 1]  # False Positives (predicted 1, actual 0)

# Calculate Accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)

# Print the accuracy
print(paste("Accuracy:", round(accuracy, 4)))
## [1] "Accuracy: 0.9813"
# In-sample evaluation on the training data.
pred <- predict(fit.rf.ranger, data = insurance.data.train)
# Create a data frame with actual and predicted values
test_df <- data.frame(actual = insurance.data.train$insuranceclaim,
                      pred = ifelse(pred$predictions > 0.5, 1, 0))
# Create a confusion matrix
tab <- table(test_df$pred, test_df$actual)


# Train-set accuracy, sensitivity, specificity
sum(diag(tab))/sum(tab)
## [1] 0.9990654
sensitivity(tab)
## [1] 0.9977477
specificity(tab)
## [1] 1

The Ranger Random Forest model, constructed with predictors including BMI, number of children, smoker status, and charges, demonstrates strong predictive performance. The prediction error is relatively low, with a mean squared error (MSE) of 0.0251, indicating accurate predictions. The R-squared value of 0.897 suggests the model’s ability to explain variance in the data. The confusion matrix reveals a high accuracy of 98.13%, with sensitivity (True Positive Rate) and specificity (True Negative Rate) at 98.20% and 98.09%, respectively. The misclassification error rate is low at 1.87%. In terms of variable importance, BMI is identified as the most crucial predictor, with an importance score of 101.22, followed by the number of children (80.55), charges (42.08), and smoker status (31.58). This underscores the significance of BMI and the number of children in predicting insurance claims, as highlighted by their high importance scores.

The results look similar after dropping the sex and age columns in addition to the region column.

5. Gradient Boosting

Like random forests, boosting is also an out-of-the box learning algorithm. It gives good predictive performance for the response, usually in high-dimensional settings, with a large number of features.

Random forests build an ensemble of independent deep trees. In contrast, gradient boosting algorithms (GBMs) successively build an ensemble of shallow trees, each tree learning from the previous tree. When combined, these trees provide a highly accurate predictive algorithm.

For binary response modeling, the idea of boosting was introduced to improve the performance of weak learners. This was done by resampling the training data responses, giving more weight to the misclassified ones, thereby leading to a refined classifier (binary model) which would boost feature performance, especially in ambiguous areas of the feature space. A popular variant is the gradient boosting algorithm, efficiently implemented in XGBoost (an acronym for eXtreme Gradient Boosting).

The following code prepares the predictor matrices for the train and test datasets to be used with the xgboost package.

# Transform the predictor matrix using dummy (or indicator or one-hot) encoding 
# sparse.model.matrix builds the design matrix from the formula; the
# leading intercept column is dropped with [, -1].
matrix_predictors.train <- 
  as.matrix(sparse.model.matrix(insuranceclaim ~., data = insurance.data.train))[, -1]
matrix_predictors.test <- 
  as.matrix(sparse.model.matrix(insuranceclaim ~., data = insurance.data.test))[, -1]

Converting the insuranceclaim column to numeric and converting the train data to the xgb.DMatrix format

# Train dataset
pred.train.gbm <- data.matrix(matrix_predictors.train) # predictors only
#convert factor to numeric
insurance.data.train.gbm <- as.numeric(as.character(insurance.data.train$insuranceclaim)) 
# Bundle the predictors and 0/1 labels into xgboost's optimized format.
dtrain <- xgb.DMatrix(data = pred.train.gbm, label = insurance.data.train.gbm)

Converting the insuranceclaim column to numeric and converting the test data to the xgb.DMatrix format

# Test dataset
pred.test.gbm <- data.matrix(matrix_predictors.test) # predictors only
 #convert factor to numeric
insurance.data.test.gbm <- as.numeric(as.character(insurance.data.test$insuranceclaim))
# Bundle the predictors and 0/1 labels into xgboost's optimized format.
dtest <- xgb.DMatrix(data = pred.test.gbm, label = insurance.data.test.gbm)

XGBoost model fitting with the binary:logistic objective and nrounds = 2; the AUC improves from round 1 to round 2 on both the train and test datasets.

# Datasets monitored during boosting; AUC is reported for each round.
watchlist <- list(train = dtrain, test = dtest)
# Shallow trees (max_depth = 2), learning rate eta = 1, binary logistic
# objective evaluated with AUC.
param <- list(max_depth = 2, eta = 1, nthread = 2,
              objective = "binary:logistic", eval_metric = "auc")

# Fit a boosted ensemble with only 2 rounds (2 trees).
model.xgb <- xgb.train(param, dtrain, nrounds = 2, watchlist)
## [1]  train-auc:0.868956  test-auc:0.880014 
## [2]  train-auc:0.930225  test-auc:0.929219

Following is the plot of the single tree obtained.

# Assuming 'model.xgb' is our XGBoost model

# Plot one boosted tree from the ensemble (trees is 0-indexed, so
# trees = 1 selects the second tree).
# Bug fix: the argument is `feature_names`, not `features_names`; the
# misspelled argument was silently absorbed by `...` and the column
# names were never applied to the plot.
tree_plot <- xgb.plot.tree(model = model.xgb, trees = 1,
                           feature_names = colnames(pred.train.gbm))

htmlwidgets::saveWidget(tree_plot, "tree_plot.html")

# NOTE(review): knitr::include_graphics() is meant for image files
# (png/jpg/pdf), not saved HTML widgets — it likely does not embed this
# file. Printing `tree_plot` directly in the chunk renders the widget in
# HTML output; verify which behavior is intended.
knitr::include_graphics("tree_plot.html")

From the above plot we can see that the root split is on the charges column, which has the highest gain (about 74): if charges is less than 30175.7773 the tree next splits on children (gain about 73), otherwise on bmi (gain about 2.4), with further splits below these nodes.

# In-sample evaluation of the 2-round model: predicted probabilities on
# the training predictors, thresholded at 0.5.
pred.y.train <- predict(model.xgb, pred.train.gbm)
prediction.train <- as.numeric(pred.y.train > 0.5)
# Measure prediction accuracy on train data
# (rows = actual labels, columns = predicted labels)
# NOTE(review): caret's sensitivity()/specificity() table methods assume
# rows are PREDICTIONS; here rows are the actual labels, so the two
# values below are computed on the transposed convention — verify.
(tab<-table(insurance.data.train.gbm, prediction.train))
##                         prediction.train
## insurance.data.train.gbm   0   1
##                        0 363  81
##                        1  73 553
# Overall train-set accuracy
sum(diag(tab))/sum(tab)
## [1] 0.8560748
sensitivity(tab)
## [1] 0.8325688
specificity(tab)
## [1] 0.8722397

True Positive (TP): 553 cases where the actual class is 1, and the model predicted 1. True Negative (TN): 363 cases where the actual class is 0, and the model predicted 0. False Positive (FP): 81 cases where the actual class is 0, but the model predicted 1. False Negative (FN): 73 cases where the actual class is 1, but the model predicted 0.

Interpretation: Accuracy: (TP + TN) / (TP + TN + FP + FN) = (553 + 363) / (553 + 363 + 81 + 73) ≈ 85.6%. This is the proportion of correctly classified instances out of the total instances.

Precision (Positive Predictive Value): TP / (TP + FP) = 553 / (553 + 81) ≈ 87.2%. This is the proportion of instances predicted as positive that are actually positive.

Recall (Sensitivity, True Positive Rate): TP / (TP + FN) = 553 / (553 + 73) ≈ 88.3%. This is the proportion of actual positives that were correctly predicted as positive.

Specificity (True Negative Rate): TN / (TN + FP) = 363 / (363 + 81) ≈ 81.8%. This is the proportion of actual negatives that were correctly predicted as negative.

In summary, the model seems to have reasonably good performance on the training data, with a high accuracy, precision, recall, and specificity.

# Out-of-sample evaluation of the 2-round model on the test data.
pred.y <- predict(model.xgb, pred.test.gbm)
prediction <- as.numeric(pred.y > 0.5)
# Measure prediction accuracy on test data
# (rows = actual labels, columns = predicted labels)
(tab1 <- table(insurance.data.test.gbm, prediction))
##                        prediction
## insurance.data.test.gbm   0   1
##                       0  96  15
##                       1  23 134
# Bug fix: the original reported sensitivity/specificity/accuracy on
# `tab` (the TRAINING confusion matrix) in this test-set section; the
# test-set matrix `tab1` must be used instead.
sensitivity(tab1)
## [1] 0.8067227
specificity(tab1)
## [1] 0.8993289
sum(diag(tab1))/sum(tab1)
## [1] 0.858209

True Positive (TP): 134 cases where the actual class is 1, and the model predicted 1. True Negative (TN): 96 cases where the actual class is 0, and the model predicted 0. False Positive (FP): 15 cases where the actual class is 0, but the model predicted 1. False Negative (FN): 23 cases where the actual class is 1, but the model predicted 0.

Interpretation: Accuracy: (TP + TN) / (TP + TN + FP + FN) = (134 + 96) / (134 + 96 + 15 + 23) ≈ 85.8%. This is the proportion of correctly classified instances out of the total instances.

Precision (Positive Predictive Value): TP / (TP + FP) = 134 / (134 + 15) ≈ 89.9%. This is the proportion of instances predicted as positive that are actually positive.

Recall (Sensitivity, True Positive Rate): TP / (TP + FN) = 134 / (134 + 23) ≈ 85.3%. This is the proportion of actual positives that were correctly predicted as positive.

Specificity (True Negative Rate): TN / (TN + FP) = 96 / (96 + 15) ≈ 86.5%. This is the proportion of actual negatives that were correctly predicted as negative.

F1 Score: The harmonic mean of precision and recall. It provides a balance between precision and recall. 2 * (Precision * Recall) / (Precision + Recall).

In summary, the model appears to have reasonably good performance on the test data, with a high accuracy, precision, recall, and specificity, similar to its performance on the training data.

Following is the code to run the gradient boosting algorithm with the number of rounds increased from 2 to 10 and then to 15.

# 10 rounds from 2 

# Same watchlist and parameters as before; only nrounds changes 2 -> 10.
watchlist <- list(train = dtrain, test = dtest)
param <- list(max_depth = 2, eta = 1, nthread = 2,
              objective = "binary:logistic", eval_metric = "auc")

model.xgb <- xgb.train(param, dtrain, nrounds = 10, watchlist)
## [1]  train-auc:0.868956  test-auc:0.880014 
## [2]  train-auc:0.930225  test-auc:0.929219 
## [3]  train-auc:0.951839  test-auc:0.932576 
## [4]  train-auc:0.969386  test-auc:0.951627 
## [5]  train-auc:0.973921  test-auc:0.954926 
## [6]  train-auc:0.976492  test-auc:0.961927 
## [7]  train-auc:0.976520  test-auc:0.959861 
## [8]  train-auc:0.982112  test-auc:0.965858 
## [9]  train-auc:0.982702  test-auc:0.966030 
## [10] train-auc:0.985278  test-auc:0.967522
# Train-set confusion matrix and metrics (threshold 0.5)
pred.y.train <- predict(model.xgb, pred.train.gbm)
prediction.train <- as.numeric(pred.y.train > 0.5)
# Measure prediction accuracy on train data
(tab<-table(insurance.data.train.gbm, prediction.train))
##                         prediction.train
## insurance.data.train.gbm   0   1
##                        0 404  40
##                        1  43 583
sum(diag(tab))/sum(tab)
## [1] 0.9224299
sensitivity(tab)
## [1] 0.9038031
specificity(tab)
## [1] 0.9357945
# Test-set confusion matrix and metrics (threshold 0.5)
pred.y = predict(model.xgb, pred.test.gbm)
prediction <- as.numeric(pred.y > 0.5)
# Measure prediction accuracy on test data
(tab1<-table(insurance.data.test.gbm,prediction))
##                        prediction
## insurance.data.test.gbm   0   1
##                       0  96  15
##                       1  17 140
sensitivity(tab1)
## [1] 0.8495575
specificity(tab1)
## [1] 0.9032258
sum(diag(tab1))/sum(tab1)
## [1] 0.880597

Accuracy on the train dataset is 92%, whereas on the test dataset it is 88% with 10 rounds — an improvement over the values obtained with 2 rounds.

# 15 rounds 

# Same watchlist and parameters again; nrounds increased to 15.
watchlist <- list(train = dtrain, test = dtest)
param <- list(max_depth = 2, eta = 1, nthread = 2,
              objective = "binary:logistic", eval_metric = "auc")

model.xgb.15 <- xgb.train(param, dtrain, nrounds = 15, watchlist)
## [1]  train-auc:0.868956  test-auc:0.880014 
## [2]  train-auc:0.930225  test-auc:0.929219 
## [3]  train-auc:0.951839  test-auc:0.932576 
## [4]  train-auc:0.969386  test-auc:0.951627 
## [5]  train-auc:0.973921  test-auc:0.954926 
## [6]  train-auc:0.976492  test-auc:0.961927 
## [7]  train-auc:0.976520  test-auc:0.959861 
## [8]  train-auc:0.982112  test-auc:0.965858 
## [9]  train-auc:0.982702  test-auc:0.966030 
## [10] train-auc:0.985278  test-auc:0.967522 
## [11] train-auc:0.985753  test-auc:0.965054 
## [12] train-auc:0.986317  test-auc:0.968956 
## [13] train-auc:0.988307  test-auc:0.975928 
## [14] train-auc:0.988606  test-auc:0.975756 
## [15] train-auc:0.989419  test-auc:0.976445
# Train-set confusion matrix and metrics (threshold 0.5)
pred.y.train <- predict(model.xgb.15, pred.train.gbm)
prediction.train <- as.numeric(pred.y.train > 0.5)
# Measure prediction accuracy on train data
(tab<-table(insurance.data.train.gbm, prediction.train))
##                         prediction.train
## insurance.data.train.gbm   0   1
##                        0 412  32
##                        1  29 597
sum(diag(tab))/sum(tab)
## [1] 0.9429907
sensitivity(tab)
## [1] 0.9342404
specificity(tab)
## [1] 0.9491256
# Test-set confusion matrix and metrics (threshold 0.5)
pred.y = predict(model.xgb.15, pred.test.gbm)
prediction <- as.numeric(pred.y > 0.5)
# Measure prediction accuracy on test data
(tab1<-table(insurance.data.test.gbm,prediction))
##                        prediction
## insurance.data.test.gbm   0   1
##                       0  95  16
##                       1  12 145
sensitivity(tab1)
## [1] 0.8878505
specificity(tab1)
## [1] 0.9006211
sum(diag(tab1))/sum(tab1)
## [1] 0.8955224

Accuracy on the train dataset is 94%, whereas on the test dataset it is 89% with 15 rounds — an improvement over the values obtained with 2 rounds.

#Support Vector Machines

# Support Vector Machines
# (the previous "Classification and Regression Trees" label here was a
# copy-paste leftover from an earlier section)
library(e1071)

# Re-read the raw data so the SVM section starts from a clean copy.
insurance.data.dup <- read.csv("~/Documents/GitHub/GitHub/Insurance-Claim-Prediction/data/insurance.csv")
insurance.data <- insurance.data.dup

# Stratified 80/20 train/test split by insuranceclaim; the fixed seed
# makes the split reproducible.
set.seed(12345)
train.prop <- 0.80
strats <- insurance.data$insuranceclaim
rr <- split(1:length(strats), strats)
idx <- sort(as.numeric(unlist(sapply(rr, 
        function(x) sample(x, length(x)*train.prop)))))
insurance.data.train <- insurance.data[idx, ]
insurance.data.test <- insurance.data[-idx, ]
# Class proportions in the training split (should mirror the full data).
table(insurance.data.train$insuranceclaim)/nrow(insurance.data.train)
## 
##         0         1 
## 0.4149533 0.5850467
# Train an SVM classifier
# NOTE(review): insuranceclaim is numeric 0/1, so svm() defaults to
# eps-REGRESSION (see "SVM-Type" in the printed output), not
# C-classification; convert the response to a factor if a true
# classifier is intended.
svm_model <- svm(insuranceclaim ~ ., data = insurance.data.train, kernel = "linear", cost = 1)

svm_model
## 
## Call:
## svm(formula = insuranceclaim ~ ., data = insurance.data.train, kernel = "linear", 
##     cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.1428571 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  981
# Make predictions on the test set
# (regression scores from the eps-regression fit; threshold at 0.5)
predictions <- predict(svm_model, newdata = insurance.data.test, type= "response")
prediction <- as.numeric(predictions > 0.5)

# Confusion matrix: rows = predicted, columns = actual
conf_matrix_svm <- table(prediction, insurance.data.test$insuranceclaim)
# Display the confusion matrix
print(conf_matrix_svm)
##           
## prediction   0   1
##          0  98  22
##          1  13 135
# Test-set accuracy, sensitivity, specificity
sum(diag(conf_matrix_svm))/sum(conf_matrix_svm)
## [1] 0.869403
sensitivity(conf_matrix_svm)
## [1] 0.8828829
specificity(conf_matrix_svm)
## [1] 0.8598726
# Make predictions on the train set
predictions <- predict(svm_model, newdata = insurance.data.train, type= "response")
prediction <- as.numeric(predictions > 0.5)

conf_matrix_svm <- table(prediction, insurance.data.train$insuranceclaim)
# Display the confusion matrix
print(conf_matrix_svm)
##           
## prediction   0   1
##          0 370  69
##          1  74 557
# Train-set accuracy, sensitivity, specificity
sum(diag(conf_matrix_svm))/sum(conf_matrix_svm)
## [1] 0.8663551
sensitivity(conf_matrix_svm)
## [1] 0.8333333
specificity(conf_matrix_svm)
## [1] 0.8897764
# NOTE(review): plot.svm on a regression fit with several predictors may
# not produce a meaningful decision boundary plot — verify it renders.
plot(svm_model, insurance.data.train)

SVM Model Information

Parameter Value
SVM-Type eps-regression
SVM-Kernel linear
Cost 1
Gamma 0.1428571
Epsilon 0.1

Number of Support Vectors

Support Vectors
981

Confusion Matrix and Performance Metrics for Train Set

Predicted / Actual 0 1
0 370 69
1 74 557
  • Accuracy: 86.64%
  • Sensitivity (True Positive Rate): 83.33%
  • Specificity (True Negative Rate): 88.98%

Confusion Matrix and Performance Metrics for Test Set

Predicted / Actual 0 1
0 98 22
1 13 135
  • Accuracy: 86.94%
  • Sensitivity (True Positive Rate): 88.29%
  • Specificity (True Negative Rate): 85.98%